def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """Build the Matcher, wiring in its database handles and helper clients.

    Every argument is optional. When an argument is omitted (or falsy) the
    constructor instantiates the corresponding default object itself.

    :param pilotAgentsDB: object used instead of a new ``PilotAgentsDB()``
    :param jobDB: object used instead of a new ``JobDB()``
    :param tqDB: object used instead of a new ``TaskQueueDB()``
    :param jlDB: object used instead of a new ``JobLoggingDB()``
    :param opsHelper: object used instead of a new ``Operations()``
    """
    # Truthiness (not "is None") decides whether the default is created,
    # exactly as a plain "if arg:" test would.
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter reuses the JobDB and Operations helper selected above.
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)
    self.siteClient = SiteStatus()
def initialize(self):
    """Set the agent's default options and create its working attributes.

    :return: S_OK()
    """
    # Default agent options, applied in this order.
    defaultOptions = (
        ("PollingTime", 60.0),
        ("maxPilotWaitingHours", 6),
    )
    for optionName, optionValue in defaultOptions:
        self.am_setOption(optionName, optionValue)

    self.queueDict = {}

    # Start from the module-level limits.
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT

    self.siteStatus = SiteStatus()
    return S_OK()
def _updateSiteMask(self, sitesData):
    """Tag every site entry with its mask status and return the data.

    For each site key, the nested entry receives a ``'siteMaskStatus'``
    item set to ``'Allowed'`` or ``'Banned'``. ``sitesData`` is modified in
    place.

    :param sitesData: mapping of site name to a mutable per-site mapping
    :return: S_OK( sitesData )
    """
    statusClient = SiteStatus()

    # Shallow copy: the keys are snapshotted, but each per-site entry is the
    # same object as in sitesData.
    snapshot = dict(sitesData)

    for siteName in snapshot:
        #
        # FIXME: we are only taking into account ComputingAccess
        #
        # NOTE(review): the truthiness test assumes isUsableSite returns a
        # plain boolean -- confirm against the SiteStatus client in use.
        usable = statusClient.isUsableSite(siteName, 'ComputingAccess')
        snapshot[siteName]['siteMaskStatus'] = 'Allowed' if usable else 'Banned'
        sitesData[siteName]['siteMaskStatus'] = snapshot[siteName]['siteMaskStatus']

    return S_OK(sitesData)
def _updateSiteMask(self, sitesData):
    """Annotate each site in ``sitesData`` with its site-mask status.

    Each per-site entry gets ``'siteMaskStatus'`` set to ``'Allowed'`` or
    ``'Banned'``. The input mapping is updated in place and also returned.

    :param sitesData: mapping of site name to a mutable per-site mapping
    :return: S_OK(sitesData)
    """
    maskClient = SiteStatus()

    # dict() copies only the outer mapping; the nested entries stay shared
    # with sitesData.
    maskView = dict(sitesData)

    for name in maskView:
        #
        # FIXME: we are only taking into account ComputingAccess
        #
        # NOTE(review): relies on isUsableSite returning something whose
        # truthiness reflects usability -- confirm.
        if maskClient.isUsableSite(name, 'ComputingAccess'):
            verdict = 'Allowed'
        else:
            verdict = 'Banned'
        maskView[name]['siteMaskStatus'] = verdict
        sitesData[name]['siteMaskStatus'] = maskView[name]['siteMaskStatus']

    return S_OK(sitesData)
def initialize(self):
    """Create the thread pool and register the clients used by the agent.

    The pool size is read from the ``maxNumberOfThreads`` agent option and
    falls back to the private default held on the instance.

    :return: S_OK()
    """
    poolSize = self.am_getOption('maxNumberOfThreads', self.__maxNumberOfThreads)
    # The same value is passed for both ThreadPool arguments.
    self.threadPool = ThreadPool(poolSize, poolSize)

    # Keep a direct handle on the SiteStatus client and also expose it
    # through the clients mapping.
    self.siteClient = SiteStatus()
    self.clients['SiteStatus'] = self.siteClient
    self.clients['ResourceManagementClient'] = ResourceManagementClient()

    return S_OK()
def __init__(self):
    """Initialise the DIRAC Admin API object.

    Sets up the CS API handle, the debug flag, scratch and current
    directories, the RSS flag and a SiteStatus client.
    """
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # Debug mode is on exactly when the configured log level is "DEBUG",
    # which is also the default when the option is absent.
    configuredLevel = gConfig.getValue(self.section + "/LogLevel", "DEBUG")
    self.dbg = configuredLevel == "DEBUG"

    self.scratchDir = gConfig.getValue(self.section + "/ScratchDir", "/tmp")
    self.currentDir = os.getcwd()

    # Only the flag is retained from the ResourceStatus object.
    resourceStatus = ResourceStatus()
    self.rssFlag = resourceStatus.rssFlag

    self.sitestatus = SiteStatus()
class InputDataValidation(OptimizerExecutor):
    """Optimizer executor for input data validation.

    A specific optimizer is expected to provide:
      - initializeOptimizer(), run before each execution cycle
      - checkJob(), the main method called for each job
    """

    @classmethod
    def initializeOptimizer(cls):
        """Prepare the class-level state used by the optimizer.

        Seeds the random generator, creates the two caches, connects to the
        JobDB, creates a SiteStatus client and sets the ``FailedStatus``
        option.

        :return: S_OK() on success, S_ERROR if JobDB cannot be imported or
                 instantiated
        """
        random.seed()

        cls.__SEStatus = DictCache.DictCache()
        cls.__sitesForSE = DictCache.DictCache()

        # The import is done here so that a failure is reported as S_ERROR.
        try:
            from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
        except ImportError as importFailure:
            return S_ERROR("Could not import JobDB: %s" % str(importFailure))

        try:
            cls.__jobDB = JobDB()
        except RuntimeError:
            return S_ERROR("Cannot connect to JobDB")

        cls.__siteStatus = SiteStatus()

        cls.ex_setOption("FailedStatus", "Input Data Not Available")
        return S_OK()
def __init__(self):
    """Initialise the DIRAC Admin API object.

    Besides the CS API handle, debug flag, directories, RSS flag and
    SiteStatus client, this also stores the set of site names returned by
    ``getSites()``.
    """
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # Debug mode follows the configured log level ('DEBUG' is the default).
    logLevel = gConfig.getValue(self.section + '/LogLevel', 'DEBUG')
    self.dbg = logLevel == 'DEBUG'

    self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp')
    self.currentDir = os.getcwd()

    rssClient = ResourceStatus()
    self.rssFlag = rssClient.rssFlag

    self.sitestatus = SiteStatus()

    # A result without a 'Value' entry yields an empty set.
    sitesResult = getSites()
    self._siteSet = set(sitesResult.get('Value', []))
def printCEInfo(voName):
    """Log a table of the sites, CEs and queues available to a community.

    Each queue produces one row. The site name is shown only on the first
    row of a site, and the CE name and CE type only on the first row of a
    CE. The status column holds the site mask status ("Active" or
    "InActive") unless RSS is enabled and supplies a status for the CE.

    :param voName: community passed to ``Resources.getQueues``
    :return: S_OK() on success, or the failed result of the site mask query
    """
    resultQueues = Resources.getQueues(community=voName)
    if not resultQueues["OK"]:
        gLogger.error("Failed to get CE information")
        DIRACExit(-1)
    queuesBySite = resultQueues["Value"]

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask["OK"]:
        return resultMask
    siteMaskList = resultMask.get("Value", [])

    rssClient = ResourceStatus()

    header = ("Site", "CE", "CEType", "Queue", "Status")
    rows = []

    for site in queuesBySite:
        if site in siteMaskList:
            siteStatus = "Active"
        else:
            siteStatus = "InActive"
        firstRowOfSite = True

        for ce in queuesBySite[site]:
            # The CE inherits the site status unless RSS gives its own.
            ceStatus = siteStatus
            if rssClient.rssFlag:
                result = rssClient.getElementStatus(ce, "ComputingElement")
                if result["OK"]:
                    ceStatus = result["Value"][ce]["all"]

            firstRowOfCE = True
            for queue in queuesBySite[site][ce]["Queues"]:
                shownSite = site if firstRowOfSite else ""
                if firstRowOfCE:
                    shownCE = ce
                    shownType = queuesBySite[site][ce]["CEType"]
                else:
                    shownCE = ""
                    shownType = ""
                rows.append((shownSite, shownCE, shownType, queue, ceStatus))
                firstRowOfCE = False
                firstRowOfSite = False

    table = printTable(header, rows, printOut=False, columnSeparator=" ")
    gLogger.notice(table)
    return S_OK()
def printCEInfo(voName):
    """Report the computing resources of a community as a table in the log.

    One row is produced per queue. The site name appears only on the first
    row of each site; the CE name and type appear only on the first row of
    each CE. The status is "Active"/"InActive" from the site mask, replaced
    by the RSS status of the CE when RSS is enabled and the lookup succeeds.

    :param voName: community passed to ``Resources.getQueues``
    :return: S_OK() on success, or the failed result of the site mask query
    """
    resultQueues = Resources.getQueues(community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteMap = resultQueues['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        return resultMask
    usableSites = resultMask.get('Value', [])

    rssClient = ResourceStatus()

    columns = ("Site", 'CE', 'CEType', 'Queue', 'Status')
    tableRows = []

    for site in siteMap:
        siteStatus = "Active" if site in usableSites else "InActive"
        siteNamePending = True

        for ce in siteMap[site]:
            # Default to the site status; RSS may override it below.
            ceStatus = siteStatus
            if rssClient.rssFlag:
                rssResult = rssClient.getElementStatus(ce, "ComputingElement")
                if rssResult['OK']:
                    ceStatus = rssResult['Value'][ce]['all']

            ceNamePending = True
            for queue in siteMap[site][ce]['Queues']:
                siteCell = site if siteNamePending else ''
                ceCell, typeCell = '', ''
                if ceNamePending:
                    ceCell = ce
                    typeCell = siteMap[site][ce]['CEType']
                tableRows.append((siteCell, ceCell, typeCell, queue, ceStatus))
                ceNamePending = False
                siteNamePending = False

    rendered = printTable(columns, tableRows, printOut=False, columnSeparator=' ')
    gLogger.notice(rendered)
    return S_OK()
def __init__(self):
    """Create the SiteStatus client and the per-element-type RSS caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """
    super(ResourceStatus, self).__init__()

    self.siteStatus = SiteStatus()

    # The cache lifetime comes from the RSS configuration so that it can be
    # tuned without touching the code.
    rssConfig = RssConfiguration()
    lifeTime = int(rssConfig.getConfigCache())

    # One cache per element type (StorageElement, ComputingElement).
    # Should these be generated on the fly instead of being hardcoded?
    self.seCache = RSSCache('Storage', lifeTime, self._updateSECache)
    self.ceCache = RSSCache('Computing', lifeTime, self._updateCECache)
def initialize(self):
    """Apply the default agent options and reset the agent's attributes.

    :return: S_OK()
    """
    # Defaults for the agent options.
    self.am_setOption("PollingTime", 60.0)
    self.am_setOption("maxPilotWaitingHours", 6)

    # No queues are known yet.
    self.queueDict = dict()

    # Limits start from the module-level constants.
    self.maxJobsInFillMode, self.maxPilotsToSubmit = MAX_JOBS_IN_FILLMODE, MAX_PILOTS_TO_SUBMIT

    self.siteStatus = SiteStatus()

    return S_OK()
def __init__(self, submitPool):
    """Create the logger and load the default settings of the director.

    :param submitPool: submit pool name; it is used in the logger name
                       (unless it equals ``self.gridMiddleware``) and in the
                       SubmitPool pilot option
    """
    # self.gridMiddleware is read but never assigned here -- presumably a
    # class-level attribute; verify against the class definition.
    if submitPool == self.gridMiddleware:
        loggerName = '%sPilotDirector' % self.gridMiddleware
    else:
        loggerName = '%sPilotDirector/%s' % (self.gridMiddleware, submitPool)
    self.log = gLogger.getSubLogger(loggerName)

    # Pilot script and installation settings
    self.pilot = DIRAC_PILOT
    self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installProject = DIRAC_PROJECT
    self.installation = DIRAC_INSTALLATION
    self.pilotExtensionsList = []
    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.extraModules = DIRAC_MODULES
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [self.gridMiddleware]

    # List-match settings
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    # Error reporting settings
    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    self.siteClient = SiteStatus()

    # NOTE(review): self.log is always assigned at the top of this method,
    # so this fallback looks redundant -- confirm before removing.
    if 'log' not in self.__dict__:
        self.log = gLogger.getSubLogger('PilotDirector')
    self.log.info('Initialized')
def __checkSitesInMask(self, job, siteCandidates):
    """Keep only the candidate sites that are currently usable.

    Candidates missing from the usable ('ComputingAccess') site list are
    reported at verbose level and dropped; the input order is preserved.

    :param job: job identifier, used only in the log message
    :param siteCandidates: iterable of candidate site names
    :return: S_OK( list of accepted sites ), or S_ERROR if the usable sites
             cannot be obtained
    """
    maskResult = SiteStatus().getUsableSites('ComputingAccess')
    if not maskResult['OK']:
        return S_ERROR('Could not get site mask')
    inMask = maskResult['Value']

    accepted = []
    for siteName in siteCandidates:
        if siteName in inMask:
            accepted.append(siteName)
            continue
        self.log.verbose('%s is a candidate site for job %s but not in mask' % (siteName, job))

    self.log.info('Candidate sites in Mask are %s' % (accepted))
    return S_OK(accepted)
def getSiteMask(self, printOutput=False):
    """Return the sites currently usable for 'ComputingAccess'.

    The list is obtained from a SiteStatus client. When ``printOutput`` is
    true the list is sorted in place and each site is printed on its own
    line, so the returned value is sorted as well.

    Example usage:

      >>> result = diracAdmin.getSiteMask()

    :param printOutput: print the sorted site names when true
    :returns: S_OK,S_ERROR
    """
    result = SiteStatus().getUsableSites('ComputingAccess')
    if not result['OK']:
        return result

    usable = result['Value']
    if printOutput:
        # In-place sort: the caller sees the same, now sorted, list.
        usable.sort()
        for siteName in usable:
            print(siteName)

    return result
def initialize(self):
    """Set up the thread pool and the clients the agent relies on.

    The number of threads is taken from the ``maxNumberOfThreads`` agent
    option; the private default stored on the instance is used otherwise.

    :return: S_OK()
    """
    nThreads = self.am_getOption('maxNumberOfThreads', self.__maxNumberOfThreads)
    # Both ThreadPool arguments receive the same value.
    self.threadPool = ThreadPool(nThreads, nThreads)

    # The SiteStatus client is reachable both as an attribute and through
    # the clients mapping.
    self.siteClient = SiteStatus()
    self.clients['SiteStatus'] = self.siteClient

    self.clients['ResourceManagementClient'] = ResourceManagementClient()

    return S_OK()
def __checkSitesInMask(self, job, siteCandidates):
    """Filter the candidate sites down to those in the current site mask.

    Each candidate missing from the usable 'ComputingAccess' sites is logged
    at verbose level and skipped; the order of the candidates is preserved.

    :param job: job identifier, used only in the log message
    :param siteCandidates: iterable of candidate site names
    :return: S_OK(list of sites in the mask), or S_ERROR if the usable
             sites cannot be retrieved
    """
    statusClient = SiteStatus()
    usableResult = statusClient.getUsableSites('ComputingAccess')
    if not usableResult['OK']:
        return S_ERROR('Could not get site mask')
    maskSites = usableResult['Value']

    kept = []
    for candidate in siteCandidates:
        if candidate in maskSites:
            kept.append(candidate)
        else:
            self.log.verbose(
                '%s is a candidate site for job %s but not in mask' % (candidate, job))

    self.log.info('Candidate sites in Mask are %s' % (kept))
    return S_OK(kept)
def getSiteMask(self, printOutput=False):
    """Retrieve the sites usable for 'ComputingAccess' from SiteStatus.

    With ``printOutput`` enabled the site list is sorted in place and every
    site is printed on a separate line; the returned result then carries
    the sorted list.

    Example usage:

      >>> result = diracAdmin.getSiteMask()

    :param printOutput: print the sorted site names when true
    :returns: S_OK,S_ERROR
    """
    statusClient = SiteStatus()
    result = statusClient.getUsableSites('ComputingAccess')

    if result['OK']:
        siteList = result['Value']
        if printOutput:
            # Sorting mutates the list held inside the returned result.
            siteList.sort()
            for entry in siteList:
                print(entry)

    return result
def getBannedSites(self, printOutput=False):
    """Return the sorted list of sites unusable for 'ComputingAccess'.

    The list reported by SiteStatus is sorted in place. With
    ``printOutput`` enabled the names are printed one per line.

    Example usage:

      >>> print(diracAdmin.getBannedSites())
      {'OK': True, 'Value': []}

    :param printOutput: print the banned site names when true
    :returns: S_OK,S_ERROR
    """
    queryResult = SiteStatus().getUnusableSites('ComputingAccess')
    if not queryResult['OK']:
        self.log.warn(queryResult['Message'])
        return queryResult

    banned = queryResult['Value']
    banned.sort()

    if printOutput:
        print('\n'.join(banned))

    return S_OK(banned)
def getBannedSites(self, printOutput=False):
    """Retrieve the sites that are not usable for 'ComputingAccess'.

    The list obtained from SiteStatus is sorted in place before being
    returned; optionally it is also printed, one site per line.

    Example usage:

      >>> print(diracAdmin.getBannedSites())
      {'OK': True, 'Value': []}

    :param printOutput: print the banned site names when true
    :returns: S_OK,S_ERROR
    """
    statusClient = SiteStatus()
    unusable = statusClient.getUnusableSites('ComputingAccess')

    if unusable['OK']:
        siteNames = unusable['Value']
        siteNames.sort()
        if printOutput:
            print('\n'.join(siteNames))
        return S_OK(siteNames)

    # The failure is logged and handed back to the caller untouched.
    self.log.warn(unusable['Message'])
    return unusable
def __init__(self):
    """Initialise the DIRAC Admin API object.

    Creates the CS API handle, derives the debug flag from the configured
    log level, records the scratch and current directories, and stores the
    RSS flag together with a SiteStatus client.
    """
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # 'DEBUG' is also the default, so debugging is on when the option is
    # not set.
    self.dbg = (gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG')

    scratchOption = self.section + '/ScratchDir'
    self.scratchDir = gConfig.getValue(scratchOption, '/tmp')
    self.currentDir = os.getcwd()

    # Only the flag of the ResourceStatus object is kept.
    self.rssFlag = ResourceStatus().rssFlag
    self.sitestatus = SiteStatus()
def optimizeJob( self, jid, jobState ):
    """Decide where a job can run: hold it, send it to the task queue, or stage its data.

    The steps, in order:
      1. If the job has been rescheduled, hold it until the configured
         delay since 'RescheduleTime' has elapsed.
      2. Read the sites required/banned by the user and the currently
         usable/unusable sites for 'ComputingAccess'.
      3. If the user selected sites and none of them passes the site
         filter, hold the job (unless its type is listed in
         'ExcludedOnHoldJobTypes').
      4. Without input data, send the job to the task queue.
      5. With input data, narrow the site candidates stored by the input
         data optimizer, drop sites that do not hold all the files, and
         resolve whether staging is needed.
      6. Either send the job to the task queue (no staging needed) or
         request staging at one site and set the job site.

    :param jid: job identifier (not used in this method body)
    :param jobState: job state object used to read attributes and input data
    :return: S_ERROR on failure, otherwise the result of the helper that
             ends the processing (hold, send to TQ, or set job site)
    """
    # Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
        return result
    attDict = result[ 'Value' ]
    try:
        reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ValueError:
        return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
        delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
        # The reschedule count indexes the delay list, capped at the last entry
        delay = delays[ min( reschedules, len( delays ) - 1 ) ]
        waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
        if waited < delay:
            return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    # Get site requirements
    result = self._getSitesRequired( jobState )
    if not result[ 'OK' ]:
        return result
    userSites, userBannedSites = result[ 'Value' ]

    # Get active and banned sites from DIRAC
    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
        return S_ERROR( "Cannot retrieve active sites from JobDB" )
    usableSites = result[ 'Value' ]
    result = siteStatus.getUnusableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
        return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    unusableSites = result[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        result = jobState.getAttribute( "JobType" )
        if not result[ 'OK' ]:
            return S_ERROR( "Could not retrieve job type" )
        jobType = result[ 'Value' ]
        # Job types listed in ExcludedOnHoldJobTypes are never put on hold here
        if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
            sites = self._applySiteFilter( userSites, usableSites, unusableSites )
            if not sites:
                return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( userSites ) )

    # Get the Input data
    # Third, check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
        return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
        # No input data? Generate requirements and next
        return self.__sendToTQ( jobState, userSites, userBannedSites )

    inputData = result[ 'Value' ]

    self.jobLog.verbose( 'Has an input data requirement' )
    # The site candidates were stored under the input data optimizer's name
    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
        self.jobLog.error( "Could not retrieve input data info: %s" % result[ 'Message' ] )
        return S_ERROR( "File Catalog Access Failure" )
    opData = result[ 'Value' ]
    if 'SiteCandidates' not in opData:
        return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    siteCandidates = self._applySiteFilter( siteCandidates, userSites, userBannedSites )
    if not siteCandidates:
        return S_ERROR( "Impossible InputData * Site requirements" )

    # Keep the per-site data only for the candidates that survived the filter
    idSites = {}
    for site in siteCandidates:
        idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    # Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    # Collected first and removed afterwards, so idSites is not modified while iterating it
    errorSites = set()
    for site in idSites:
        if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
            self.jobLog.error( "Site candidate %s does not have all the input data" % site )
            errorSites.add( site )
    for site in errorSites:
        idSites.pop( site )
    if not idSites:
        return S_ERROR( "Site candidates do not have all the input data" )

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
        return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self._applySiteFilter( siteCandidates, usableSites, unusableSites )
    if not stageSites:
        return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
        if not self.__checkStageAllowed( jobState ):
            return S_ERROR( "Stage not allowed" )

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    result = self.__requestStaging( jobState, stageSite, opData )
    if not result[ 'OK' ]:
        return result
    stageLFNs = result[ 'Value' ]
    self._updateSharedSESites( stageSite, stageLFNs, opData )
    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
        return result

    return self._setJobSite( jobState, stageSites )
class SiteDirector(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ def initialize(self): """ Standard constructor """ self.am_setOption("PollingTime", 60.0) self.am_setOption("maxPilotWaitingHours", 6) self.queueDict = {} self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT self.siteStatus = SiteStatus() return S_OK() def beginExecution(self): self.gridEnv = self.am_getOption("GridEnv", getGridEnv()) # The SiteDirector is for a particular user community self.vo = self.am_getOption("Community", '') if not self.vo: self.vo = CSGlobals.getVO() # The SiteDirector is for a particular user group self.group = self.am_getOption("Group", '') # self.voGroups contain all the eligible user groups for pilots submutted by this SiteDirector self.voGroups = [] # Choose the group for which pilots will be submitted. This is a hack until # we will be able to match pilots to VOs. 
if not self.group: if self.vo: result = Registry.getGroupsForVO(self.vo) if not result['OK']: return result for group in result['Value']: if 'NormalUser' in Registry.getPropertiesForGroup(group): self.voGroups.append(group) else: self.voGroups = [self.group] result = findGenericPilotCredentials(vo=self.vo) if not result['OK']: return result self.pilotDN, self.pilotGroup = result['Value'] self.pilotDN = self.am_getOption("PilotDN", self.pilotDN) self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup) self.platforms = [] self.sites = [] self.defaultSubmitPools = '' if self.group: self.defaultSubmitPools = Registry.getGroupOption( self.group, 'SubmitPools', '') elif self.vo: self.defaultSubmitPools = Registry.getVOOption( self.vo, 'SubmitPools', '') self.pilot = self.am_getOption('PilotScript', DIRAC_PILOT) self.install = DIRAC_INSTALL self.workingDirectory = self.am_getOption('WorkDirectory') self.maxQueueLength = self.am_getOption('MaxQueueLength', 86400 * 3) self.pilotLogLevel = self.am_getOption('PilotLogLevel', 'INFO') self.maxJobsInFillMode = self.am_getOption('MaxJobsInFillMode', self.maxJobsInFillMode) self.maxPilotsToSubmit = self.am_getOption('MaxPilotsToSubmit', self.maxPilotsToSubmit) self.pilotWaitingFlag = self.am_getOption('PilotWaitingFlag', True) self.pilotWaitingTime = self.am_getOption('MaxPilotWaitingTime', 7200) # Flags self.updateStatus = self.am_getOption('UpdatePilotStatus', True) self.getOutput = self.am_getOption('GetPilotOutput', True) self.sendAccounting = self.am_getOption('SendPilotAccounting', True) # Get the site description dictionary siteNames = None if not self.am_getOption('Site', 'Any').lower() == "any": siteNames = self.am_getOption('Site', []) ceTypes = None if not self.am_getOption('CETypes', 'Any').lower() == "any": ceTypes = self.am_getOption('CETypes', []) ces = None if not self.am_getOption('CEs', 'Any').lower() == "any": ces = self.am_getOption('CEs', []) self._resources = Resources.Resources(vo=self.vo) result = 
self._resources.getEligibleQueuesInfo(siteList=siteNames, ceList=ces, ceTypeList=ceTypes, mode='Direct') if not result['OK']: return result resourceDict = result['Value'] result = self.getQueues(resourceDict) if not result['OK']: return result #if not siteNames: # siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' ) # if siteName == 'Unknown': # return S_OK( 'No site specified for the SiteDirector' ) # else: # siteNames = [siteName] #self.siteNames = siteNames if self.updateStatus: self.log.always('Pilot status update requested') if self.getOutput: self.log.always('Pilot output retrieval requested') if self.sendAccounting: self.log.always('Pilot accounting sending requested') self.log.always('Sites:', siteNames) self.log.always('CETypes:', ceTypes) self.log.always('CEs:', ces) self.log.always('PilotDN:', self.pilotDN) self.log.always('PilotGroup:', self.pilotGroup) self.log.always('MaxPilotsToSubmit:', self.maxPilotsToSubmit) self.log.always('MaxJobsInFillMode:', self.maxJobsInFillMode) self.localhost = socket.getfqdn() self.proxy = '' if self.queueDict: self.log.always("Agent will serve queues:") for queue in self.queueDict: self.log.always("Site: %s, CE: %s, Queue: %s" % (self.queueDict[queue]['Site'], self.queueDict[queue]['CEName'], queue)) return S_OK() def getQueues(self, resourceDict): """ Get the list of relevant CEs and their descriptions """ self.queueDict = {} ceFactory = ComputingElementFactory() for site in resourceDict: result = self._resources.getSiteFullName(site) if not result['OK']: continue siteFullName = result['Value'] for ce in resourceDict[site]: ceDict = resourceDict[site][ce] qDict = ceDict.pop('Queues') for queue in qDict: queueName = '%s_%s' % (ce, queue) self.queueDict[queueName] = {} self.queueDict[queueName]['ParametersDict'] = qDict[queue] self.queueDict[queueName]['ParametersDict'][ 'Queue'] = queue self.queueDict[queueName]['ParametersDict'][ 'Site'] = siteFullName self.queueDict[queueName]['ParametersDict'][ 'GridEnv'] = 
self.gridEnv self.queueDict[queueName]['ParametersDict'][ 'Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown') # Evaluate the CPU limit of the queue according to the Glue convention # To Do: should be a utility if "maxCPUTime" in self.queueDict[queueName]['ParametersDict'] and \ "SI00" in self.queueDict[queueName]['ParametersDict']: maxCPUTime = float(self.queueDict[queueName] ['ParametersDict']['maxCPUTime']) # For some sites there are crazy values in the CS maxCPUTime = max(maxCPUTime, 0) maxCPUTime = min(maxCPUTime, 86400 * 12.5) si00 = float(self.queueDict[queueName] ['ParametersDict']['SI00']) queueCPUTime = 60. / 250. * maxCPUTime * si00 self.queueDict[queueName]['ParametersDict'][ 'CPUTime'] = int(queueCPUTime) qwDir = os.path.join(self.workingDirectory, queue) if not os.path.exists(qwDir): os.makedirs(qwDir) self.queueDict[queueName]['ParametersDict'][ 'WorkingDirectory'] = qwDir platform = '' if "Platform" in self.queueDict[queueName][ 'ParametersDict']: platform = self.queueDict[queueName]['ParametersDict'][ 'Platform'] elif "Platform" in ceDict: platform = ceDict['Platform'] elif "OS" in ceDict: architecture = ceDict.get('architecture', 'x86_64') OS = ceDict['OS'] platform = '_'.join([architecture, OS]) if platform and not platform in self.platforms: self.platforms.append(platform) if not "Platform" in self.queueDict[queueName][ 'ParametersDict'] and platform: result = Resources.getDIRACPlatform(platform) if result['OK']: self.queueDict[queueName]['ParametersDict'][ 'Platform'] = result['Value'] ceQueueDict = dict(ceDict) ceQueueDict.update( self.queueDict[queueName]['ParametersDict']) result = ceFactory.getCE(ceName=ce, ceType=ceDict['CEType'], ceParametersDict=ceQueueDict) if not result['OK']: return result self.queueDict[queueName]['CE'] = result['Value'] self.queueDict[queueName]['CEName'] = ce self.queueDict[queueName]['CEType'] = ceDict['CEType'] self.queueDict[queueName]['Site'] = siteFullName self.queueDict[queueName]['QueueName'] = queue 
self.queueDict[queueName]['Platform'] = platform result = self.queueDict[queueName]['CE'].isValid() if not result['OK']: self.log.fatal(result['Message']) return result if 'BundleProxy' in self.queueDict[queueName][ 'ParametersDict']: self.queueDict[queueName]['BundleProxy'] = True elif 'BundleProxy' in ceDict: self.queueDict[queueName]['BundleProxy'] = True if siteFullName not in self.sites: self.sites.append(siteFullName) return S_OK() def execute(self): """ Main execution method """ if not self.queueDict: self.log.warn('No site defined, exiting the cycle') return S_OK() result = self.submitJobs() if not result['OK']: self.log.error('Errors in the job submission: ', result['Message']) if self.updateStatus: result = self.updatePilotStatus() if not result['OK']: self.log.error('Errors in updating pilot status: ', result['Message']) return S_OK() def submitJobs(self): """ Go through defined computing elements and submit jobs if necessary """ # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups result = Resources.getCompatiblePlatforms(self.platforms) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites self.log.verbose('Checking overall TQ availability with requirements') self.log.verbose(tqDict) rpcMatcher = RPCClient("WorkloadManagement/Matcher") result = rpcMatcher.getMatchingTaskQueues(tqDict) if not result['OK']: return result if not result['Value']: self.log.verbose('No Waiting jobs suitable for the director') return S_OK() queues = self.queueDict.keys() random.shuffle(queues) for queue in queues: ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] siteMask = 
self.siteStatus.isUsableSite(siteName, 'ComputingAccess') platform = self.queueDict[queue]['Platform'] if 'CPUTime' in self.queueDict[queue]['ParametersDict']: queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime']) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime)) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy(self.proxy, cpuTime - 60) # Get the number of available slots on the target site/queue result = ce.available() if not result['OK']: self.log.warn( 'Failed to check the availability of queue %s: \n%s' % (queue, result['Message'])) continue ceInfoDict = result['CEInfoDict'] self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \ ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) ) totalSlots = result['Value'] ceDict = ce.getParameterDict() ceDict['GridCE'] = ceName if not siteMask and 'Site' in ceDict: self.log.info('Site not in the mask %s' % siteName) self.log.info('Removing "Site" from matching Dict') del ceDict['Site'] if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! 
ceDict['SubmitPool'] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms(platform) if not result['OK']: continue ceDict['Platform'] = result['Value'] # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues(ceDict) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message']) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.info('No matching TQs found') continue totalTQJobs = 0 tqIDList = taskQueueDict.keys() for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]['Jobs'] pilotsToSubmit = min(totalSlots, totalTQJobs) # Get the number of already waiting pilots for this queue totalWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime() - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime) if not result['OK']: self.log.error('Failed to get Number of Waiting pilots', result['Message']) totalWaitingPilots = 0 else: totalWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots) pilotsToSubmit = max( 0, min(totalSlots, totalTQJobs - totalWaitingPilots)) self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \ ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit) while pilotsToSubmit > 0: self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue)) bundleProxy = self.queueDict[queue].get('BundleProxy', False) jobExecDir = '' if ceType == 'CREAM': jobExecDir = '.' 
jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir) httpProxy = self.queueDict[queue].get('HttpProxy', '') result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir) if not result['OK']: return result executable, pilotSubmissionChunk = result['Value'] result = ce.submitJob(executable, '', pilotSubmissionChunk) os.unlink(executable) if not result['OK']: self.log.error('Failed submission to queue %s:\n' % queue, result['Message']) pilotsToSubmit = 0 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName)) stampDict = {} if result.has_key('PilotStampDict'): stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in taskQueueDict: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append((tq, sumPriority)) rndm = random.random() * sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key(tqID): tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message']) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName) if not result['OK']: self.log.error('Failed to set pilot status: ', result['Message']) continue return S_OK() ##################################################################################### def __getExecutable(self, queue, pilotsToSubmit, bundleProxy=True, httpProxy='', jobExecDir=''): 
""" Prepare the full executable for queue """ proxy = None if bundleProxy: proxy = self.proxy pilotOptions, pilotsToSubmit = self.__getPilotOptions( queue, pilotsToSubmit) if pilotOptions is None: return S_ERROR('Errors in compiling pilot options') executable = self.__writePilotScript(self.workingDirectory, pilotOptions, proxy, httpProxy, jobExecDir) return S_OK([executable, pilotsToSubmit]) ##################################################################################### def __getPilotOptions(self, queue, pilotsToSubmit): """ Prepare pilot options """ queueDict = self.queueDict[queue]['ParametersDict'] pilotOptions = [] setup = gConfig.getValue("/DIRAC/Setup", "unknown") if setup == 'unknown': self.log.error('Setup is not defined in the configuration') return [None, None] pilotOptions.append('-S %s' % setup) opsHelper = Operations.Operations(group=self.pilotGroup, setup=setup) #Installation defined? installationName = opsHelper.getValue("Pilot/Installation", "") if installationName: pilotOptions.append('-V %s' % installationName) #Project defined? projectName = opsHelper.getValue("Pilot/Project", "") if projectName: pilotOptions.append('-l %s' % projectName) else: self.log.info('DIRAC project will be installed by pilots') #Request a release diracVersion = opsHelper.getValue("Pilot/Version", []) if not diracVersion: self.log.error('Pilot/Version is not defined in the configuration') return [None, None] #diracVersion is a list of accepted releases. 
Just take the first one pilotOptions.append('-r %s' % diracVersion[0]) ownerDN = self.pilotDN ownerGroup = self.pilotGroup # Request token for maximum pilot efficiency result = gProxyManager.requestToken( ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode) if not result['OK']: self.log.error('Invalid proxy token request', result['Message']) return [None, None] (token, numberOfUses) = result['Value'] pilotOptions.append('-o /Security/ProxyToken=%s' % token) # Use Filling mode pilotOptions.append('-M %s' % min(numberOfUses, self.maxJobsInFillMode)) # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode ) # with numberOfUses tokens we can submit at most: # numberOfUses / min( numberOfUses, self.maxJobsInFillMode ) # pilots newPilotsToSubmit = numberOfUses / min(numberOfUses, self.maxJobsInFillMode) if newPilotsToSubmit != pilotsToSubmit: self.log.info( 'Number of pilots to submit is changed to %d after getting the proxy token' % newPilotsToSubmit) pilotsToSubmit = newPilotsToSubmit # Debug if self.pilotLogLevel.lower() == 'debug': pilotOptions.append('-d') # CS Servers csServers = gConfig.getValue("/DIRAC/Configuration/Servers", []) pilotOptions.append('-C %s' % ",".join(csServers)) # DIRAC Extensions to be used in pilots pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", []) extensionsList = [] if pilotExtensionsList: if pilotExtensionsList[0] != 'None': extensionsList = pilotExtensionsList else: extensionsList = CSGlobals.getCSExtensions() if extensionsList: pilotOptions.append('-e %s' % ",".join(extensionsList)) # Requested CPU time pilotOptions.append('-T %s' % queueDict['CPUTime']) # CEName pilotOptions.append('-N %s' % self.queueDict[queue]['CEName']) # SiteName pilotOptions.append('-n %s' % queueDict['Site']) if 'ClientPlatform' in queueDict: pilotOptions.append("-p '%s'" % queueDict['ClientPlatform']) if 'SharedArea' in queueDict: pilotOptions.append("-o '/LocalSite/SharedArea=%s'" % queueDict['SharedArea']) if 'SI00' in 
queueDict: factor = float(queueDict['SI00']) / 250. pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % factor) pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" % factor) else: if 'CPUScalingFactor' in queueDict: pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor']) if 'CPUNormalizationFactor' in queueDict: pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict['CPUNormalizationFactor']) # Hack if self.defaultSubmitPools: pilotOptions.append( '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools) if self.group: pilotOptions.append('-G %s' % self.group) self.log.verbose("pilotOptions: ", ' '.join(pilotOptions)) return [pilotOptions, pilotsToSubmit] ##################################################################################### def __writePilotScript(self, workingDirectory, pilotOptions, proxy=None, httpProxy='', pilotExecDir=''): """ Bundle together and write out the pilot executable script, admixt the proxy if given """ try: compressedAndEncodedProxy = '' proxyFlag = 'False' if proxy is not None: compressedAndEncodedProxy = base64.encodestring( bz2.compress(proxy.dumpAllToString()['Value'])) proxyFlag = 'True' compressedAndEncodedPilot = base64.encodestring( bz2.compress(open(self.pilot, "rb").read(), 9)) compressedAndEncodedInstall = base64.encodestring( bz2.compress(open(self.install, "rb").read(), 9)) except: self.log.exception( 'Exception during file compression of proxy, dirac-pilot or dirac-install' ) return S_ERROR( 'Exception during file compression of proxy, dirac-pilot or dirac-install' ) localPilot = """#!/bin/bash /usr/bin/env python << EOF # import os, tempfile, sys, shutil, base64, bz2 try: pilotExecDir = '%(pilotExecDir)s' if not pilotExecDir: pilotExecDir = None pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir ) pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory ) os.chdir( 
pilotWorkingDirectory ) if %(proxyFlag)s: open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) ) os.chmod("proxy",0600) os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy') open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) ) open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) ) os.chmod("%(pilotScript)s",0700) os.chmod("%(installScript)s",0700) if "LD_LIBRARY_PATH" not in os.environ: os.environ["LD_LIBRARY_PATH"]="" if "%(httpProxy)s": os.environ["HTTP_PROXY"]="%(httpProxy)s" os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates') # TODO: structure the output print '===========================================================' print 'Environment of execution host' for key in os.environ.keys(): print key + '=' + os.environ[key] print '===========================================================' except Exception, x: print >> sys.stderr, x sys.exit(-1) cmd = "python %(pilotScript)s %(pilotOptions)s" print 'Executing: ', cmd sys.stdout.flush() os.system( cmd ) shutil.rmtree( pilotWorkingDirectory ) EOF """ % { 'compressedAndEncodedProxy': compressedAndEncodedProxy, 'compressedAndEncodedPilot': compressedAndEncodedPilot, 'compressedAndEncodedInstall': compressedAndEncodedInstall, 'httpProxy': httpProxy, 'pilotExecDir': pilotExecDir, 'pilotScript': os.path.basename(self.pilot), 'installScript': os.path.basename(self.install), 'pilotOptions': ' '.join(pilotOptions), 'proxyFlag': proxyFlag } fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py', prefix='DIRAC_', dir=workingDirectory) pilotWrapper = os.fdopen(fd, 'w') pilotWrapper.write(localPilot) pilotWrapper.close() return name def updatePilotStatus(self): """ Update status of pilots in transient states """ for queue in self.queueDict: ce = 
self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] queueName = self.queueDict[queue]['QueueName'] ceType = self.queueDict[queue]['CEType'] siteName = self.queueDict[queue]['Site'] result = pilotAgentsDB.selectPilots({ 'DestinationSite': ceName, 'Queue': queueName, 'GridType': ceType, 'GridSite': siteName, 'Status': TRANSIENT_PILOT_STATUS, 'OwnerDN': self.pilotDN, 'OwnerGroup': self.pilotGroup }) if not result['OK']: self.log.error('Failed to select pilots: %s' % result['Message']) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo(pilotRefs) if not result['OK']: self.log.error('Failed to get pilots info from DB', result['Message']) continue pilotDict = result['Value'] stampedPilotRefs = [] for pRef in pilotDict: if pilotDict[pRef]['PilotStamp']: stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp']) else: stampedPilotRefs = list(pilotRefs) break result = ce.isProxyValid() if not result['OK']: result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 600) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy(self.proxy, 500) result = ce.getJobStatus(stampedPilotRefs) if not result['OK']: self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message'])) continue pilotCEDict = result['Value'] for pRef in pilotRefs: newStatus = '' oldStatus = pilotDict[pRef]['Status'] ceStatus = pilotCEDict[pRef] if oldStatus == ceStatus: # Status did not change, continue continue elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS: # Pilot finished without reporting, consider it Aborted newStatus = 'Aborted' elif ceStatus != 'Unknown': # Update the pilot status to the new value newStatus = ceStatus if newStatus: self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef)) result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector') # Retrieve the pilot output now if newStatus in 
FINAL_PILOT_STATUS: if pilotDict[pRef]['OutputReady'].lower( ) == 'false' and self.getOutput: self.log.info('Retrieving output for pilot %s' % pRef) pilotStamp = pilotDict[pRef]['PilotStamp'] pRefStamp = pRef if pilotStamp: pRefStamp = pRef + ':::' + pilotStamp result = ce.getJobOutput(pRefStamp) if not result['OK']: self.log.error( 'Failed to get pilot output', '%s: %s' % (ceName, result['Message'])) else: output, error = result['Value'] if output: result = pilotAgentsDB.storePilotOutput( pRef, output, error) if not result['OK']: self.log.error( 'Failed to store pilot output', result['Message']) else: self.log.warn( 'Empty pilot output not stored to PilotDB') # The pilot can be in Done state set by the job agent check if the output is retrieved for queue in self.queueDict: ce = self.queueDict[queue]['CE'] if not ce.isProxyValid(120): result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000) if not result['OK']: return result ce.setProxy(self.proxy, 940) ceName = self.queueDict[queue]['CEName'] queueName = self.queueDict[queue]['QueueName'] ceType = self.queueDict[queue]['CEType'] siteName = self.queueDict[queue]['Site'] result = pilotAgentsDB.selectPilots({ 'DestinationSite': ceName, 'Queue': queueName, 'GridType': ceType, 'GridSite': siteName, 'OutputReady': 'False', 'Status': FINAL_PILOT_STATUS }) if not result['OK']: self.log.error('Failed to select pilots', result['Message']) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo(pilotRefs) if not result['OK']: self.log.error('Failed to get pilots info from DB', result['Message']) continue pilotDict = result['Value'] if self.getOutput: for pRef in pilotRefs: self.log.info('Retrieving output for pilot %s' % pRef) pilotStamp = pilotDict[pRef]['PilotStamp'] pRefStamp = pRef if pilotStamp: pRefStamp = pRef + ':::' + pilotStamp result = ce.getJobOutput(pRefStamp) if not result['OK']: self.log.error('Failed to get pilot output', '%s: %s' % 
(ceName, result['Message'])) else: output, error = result['Value'] result = pilotAgentsDB.storePilotOutput( pRef, output, error) if not result['OK']: self.log.error('Failed to store pilot output', result['Message']) # Check if the accounting is to be sent if self.sendAccounting: result = pilotAgentsDB.selectPilots({ 'DestinationSite': ceName, 'Queue': queueName, 'GridType': ceType, 'GridSite': siteName, 'AccountingSent': 'False', 'Status': FINAL_PILOT_STATUS }) if not result['OK']: self.log.error('Failed to select pilots', result['Message']) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo(pilotRefs) if not result['OK']: self.log.error('Failed to get pilots info from DB', result['Message']) continue pilotDict = result['Value'] result = self.sendPilotAccounting(pilotDict) if not result['OK']: self.log.error('Failed to send pilot agent accounting') return S_OK() def sendPilotAccounting(self, pilotDict): """ Send pilot accounting record """ for pRef in pilotDict: self.log.verbose('Preparing accounting record for pilot %s' % pRef) pA = PilotAccounting() pA.setEndTime(pilotDict[pRef]['LastUpdateTime']) pA.setStartTime(pilotDict[pRef]['SubmissionTime']) retVal = CS.getUsernameForDN(pilotDict[pRef]['OwnerDN']) if not retVal['OK']: userName = '******' self.log.error("Can't determine username for dn:", pilotDict[pRef]['OwnerDN']) else: userName = retVal['Value'] pA.setValueByKey('User', userName) pA.setValueByKey('UserGroup', pilotDict[pRef]['OwnerGroup']) result = getSiteForCE(pilotDict[pRef]['DestinationSite']) if result['OK'] and result['Value'].strip(): pA.setValueByKey('Site', result['Value'].strip()) else: pA.setValueByKey('Site', 'Unknown') pA.setValueByKey('GridCE', pilotDict[pRef]['DestinationSite']) pA.setValueByKey('GridMiddleware', pilotDict[pRef]['GridType']) pA.setValueByKey('GridResourceBroker', pilotDict[pRef]['Broker']) pA.setValueByKey('GridStatus', pilotDict[pRef]['Status']) if not 'Jobs' in 
pilotDict[pRef]: pA.setValueByKey('Jobs', 0) else: pA.setValueByKey('Jobs', len(pilotDict[pRef]['Jobs'])) self.log.info("Adding accounting record for pilot %s" % pilotDict[pRef]['PilotID']) retVal = gDataStoreClient.addRegister(pA) if not retVal['OK']: self.log.error('Failed to send accounting info for pilot ', pRef) else: # Set up AccountingSent flag result = pilotAgentsDB.setAccountingFlag(pRef) if not result['OK']: self.log.error('Failed to set accounting flag for pilot ', pRef) self.log.info('Committing accounting records for %d pilots' % len(pilotDict)) result = gDataStoreClient.commit() if result['OK']: for pRef in pilotDict: self.log.verbose('Setting AccountingSent flag for pilot %s' % pRef) result = pilotAgentsDB.setAccountingFlag(pRef) if not result['OK']: self.log.error('Failed to set accounting flag for pilot ', pRef) else: return result return S_OK()
def _resolveCECandidates(self, taskQueueDict):
  """ Build the list of CE hosts that are candidates for a TaskQueue.

  If the TaskQueue carries a non-empty 'GridCEs' entry, that list is returned
  untouched and the site mask is not consulted at all. Otherwise the sites
  usable for 'ComputingAccess' are taken from SiteStatus, the TaskQueue
  'BannedSites' are removed, the result is restricted to the TaskQueue 'Sites'
  (when that key is present) and the 'Host' of every eligible gLite LCG/CREAM
  computing element on the remaining sites is collected.

  :param dict taskQueueDict: TaskQueue description; reads 'TaskQueueID' and the
                             optional 'GridCEs', 'BannedSites' and 'Sites' keys
  :return: list of CE host names; an empty list when the mask can not be
           retrieved, is empty, or leaves no valid candidate
  """
  # An explicit CE list wins over everything else (e.g. SAM jobs): the user is
  # assumed to know what they are doing.
  requestedCEs = taskQueueDict.get('GridCEs')
  if requestedCEs:
    self.log.info('CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                  ', '.join(requestedCEs))
    return taskQueueDict['GridCEs']

  # Sites currently usable for computing
  maskResult = SiteStatus().getUsableSites('ComputingAccess')
  if not maskResult['OK']:
    self.log.error('Can not retrieve site Mask from DB:', maskResult['Message'])
    return []
  usableSites = maskResult['Value']
  if not usableSites:
    self.log.error('Site mask is empty')
    return []
  self.log.verbose('Site Mask: %s' % ', '.join(usableSites))

  # Drop the sites banned by the TaskQueue (done in place on the usable list)
  for bannedSite in taskQueueDict.get('BannedSites', []):
    if bannedSite in usableSites:
      usableSites.remove(bannedSite)
      self.log.verbose('Removing banned site %s from site Mask' % bannedSite)

  # Keep only the explicitly requested sites, if the TaskQueue names any
  if 'Sites' in taskQueueDict:
    wantedSites = taskQueueDict['Sites']
    siteMask = [name for name in usableSites if name in wantedSites]
  else:
    siteMask = list(usableSites)

  tqID = taskQueueDict['TaskQueueID']
  if not siteMask:
    # Nowhere to send a pilot
    self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' % tqID)
    return []
  siteNames = ', '.join(siteMask)
  self.log.info('Site Candidates for TaskQueue %s:' % tqID, siteNames)

  # Translate the surviving sites into CE hosts
  resources = Resources(vo=self.virtualOrganization)
  ceSelection = {'Site': siteMask,
                 'SubmissionMode': 'gLite',
                 'CEType': ['LCG', 'CREAM']}
  ceResult = resources.getEligibleResources('Computing', ceSelection)
  if not ceResult['OK']:
    self.log.error("Failed to get eligible ce's:", ceResult['Message'])
    return []

  # CEs without a configured Host come back as 'unknown' and are ignored
  hosts = (resources.getComputingElementValue(ceName, 'Host', 'unknown')
           for ceName in ceResult['Value'])
  ceMask = [host for host in hosts if host != 'unknown']

  if not ceMask:
    self.log.info('No CE Candidate found for TaskQueue %s:' % tqID, siteNames)

  self.log.verbose('CE Candidates for TaskQueue %s:' % tqID, ', '.join(ceMask))
  return ceMask
def checkJob(self, job, classAdJob):
  """ Run the scheduling checks for one job and route it accordingly.

  The checks are applied in this order:

    1. a job rescheduled less than the configured delay ago is left alone
       (and flagged 'On Hold: after rescheduling' if not flagged already);
    2. the user Site / BannedSites requirements must leave at least one site;
    3. the requested sites are compared with the usable / unusable sites for
       'ComputingAccess'; if none is left the job is put on hold, unless its
       JobType is listed in self.excludedOnHoldJobTypes;
    4. a job without input data is sent straight to the task queue;
    5. otherwise the optimizer site candidates are intersected with the user
       requirements and the site mask, and either a staging request is set
       for the chosen site or the job is sent to the task queue.

  :param job: job identifier
  :param classAdJob: job description; queried for 'JobType' and handed to the
                     site requirement and task queue helpers
  :return: S_OK / S_ERROR structure, or whatever the helper it delegates to returns
  """
  self.log.verbose('Job %s will be processed' % job)

  # --- 0. Recently rescheduled jobs have to wait before being scheduled again
  attributeNames = ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus']
  attrResult = self.jobDB.getJobAttributes(job, attributeNames)
  if not attrResult['OK']:
    self.log.error(attrResult['Message'])
    return S_ERROR('Can not get job attributes from JobDB')
  attributes = attrResult['Value']

  rescheduleCount = int(attributes['RescheduleCounter'])
  if rescheduleCount != 0:
    rescheduledAt = fromString(attributes['RescheduleTime'])
    elapsed = toEpoch() - toEpoch(rescheduledAt)
    # The n-th reschedule uses the n-th configured delay; beyond the end of
    # the list the maximum delay applies.
    requiredDelay = self.maxRescheduleDelay
    if rescheduleCount <= len(self.rescheduleDelaysList):
      requiredDelay = self.rescheduleDelaysList[rescheduleCount - 1]
    if elapsed < requiredDelay:
      if 'On Hold: after rescheduling' not in attributes['ApplicationStatus']:
        self.jobDB.setJobStatus(job,
                                application='On Hold: after rescheduling #%d' % rescheduleCount)
      return S_OK()

  # --- 1. Site and BannedSites requested by the job itself
  siteRequirement = self.__getJobSiteRequirement(job, classAdJob)
  userBannedSites = siteRequirement['BannedSites']
  userSites = siteRequirement['Sites']
  if userSites:
    userSites = applySiteRequirements(userSites, [], userBannedSites)
    if not userSites:
      return S_ERROR('Impossible Site Requirement')

  # --- 2. Usable and unusable sites according to the RSS
  siteStatus = SiteStatus()
  usableResult = siteStatus.getUsableSites('ComputingAccess')
  unusableResult = siteStatus.getUnusableSites('ComputingAccess')
  failedQueries = [res for res in (usableResult, unusableResult) if not res['OK']]
  if failedQueries:
    for res in failedQueries:
      self.log.error(res['Message'])
    return S_ERROR('Can not get Active and Banned Sites from JobDB')
  usableSites = usableResult['Value']
  unusableSites = unusableResult['Value']

  if userSites and not applySiteRequirements(userSites, usableSites, unusableSites):
    # Only job types that are not excluded get put on hold; excluded ones
    # carry on with the remaining checks.
    jobType = classAdJob.getAttributeString('JobType')
    if jobType not in self.excludedOnHoldJobTypes:
      holdMessage = 'On Hold: Requested site is Banned or not Active'
      self.log.info(holdMessage)
      self.jobDB.setJobStatus(job, application=holdMessage)
      return S_OK()

  # --- 3. Input data
  inputResult = self.jobDB.getInputData(job)
  if not inputResult['OK']:
    self.log.warn('Failed to get input data from JobDB for %s' % job)
    self.log.error(inputResult['Message'])
    return S_ERROR('Failed to get input data from JobDB')
  if not inputResult['Value']:
    return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

  # Empty entries do not count as input data
  inputData = [lfn for lfn in inputResult['Value'] if lfn]
  if not inputData:
    # With no input data requirement, job can proceed directly to task queue
    self.log.verbose('Job %s has no input data requirement' % job)
    return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

  self.log.verbose('Job %s has an input data requirement ' % job)

  # --- 4. Optimizer information left by the previous optimizers
  optResult = self.__checkOptimizerInfo(job)
  if not optResult['OK']:
    return optResult
  optInfo = optResult['Value']
  siteCandidates = optInfo['SiteCandidates']

  # Compare the input data site candidates with the user requirements ...
  optSites = siteCandidates.keys()
  self.log.info('Input Data Site Candidates: %s' % ', '.join(optSites))
  optSites = applySiteRequirements(optSites, userSites, userBannedSites)
  if not optSites:
    return S_ERROR('Impossible Site + InputData Requirement')

  # ... and then with the current site mask
  sites = applySiteRequirements(optSites, usableSites, unusableSites)
  if not sites:
    holdMessage = 'On Hold: InputData Site is Banned or not Active'
    self.log.info(holdMessage)
    self.jobDB.setJobStatus(job, application=holdMessage)
    return S_OK()

  # Set stager request as necessary, optimize for smallest #files on tape if
  # more than one site candidate is left at this point
  checkStaging = self.__resolveSitesForStaging(job, sites, inputData, siteCandidates)
  if not checkStaging['OK']:
    return checkStaging

  destinationSites = checkStaging['SiteCandidates']
  if not destinationSites:
    return S_ERROR('No destination sites available')

  if not checkStaging['Value']:
    # No staging required, can proceed to task queue agent and then waiting status
    self.log.verbose('Job %s does not require staging of input data' % job)
    # Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)

  # --- 5. Single site candidate chosen and staging required
  self.log.verbose('Job %s requires staging of input data' % job)

  # Account all LFNs of the selected site as being on disk
  stagingSite = destinationSites[0]
  siteDict = optInfo['SiteCandidates'][stagingSite]
  siteDict['disk'] = siteDict['disk'] + siteDict['tape']
  siteDict['tape'] = 0
  optInfo['SiteCandidates'][stagingSite] = siteDict

  self.log.verbose('Updating %s Optimizer Info for Job %s:' % (self.dataAgentName, job), optInfo)
  updateResult = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
  if not updateResult['OK']:
    return updateResult

  # Site is selected for staging, report it
  self.log.verbose('Staging site candidate for job %s is %s' % (job, stagingSite))

  stagingSitesResult = self.__getStagingSites(stagingSite, destinationSites)
  if stagingSitesResult['OK']:
    stagingSites = stagingSitesResult['Value']
  else:
    stagingSites = [stagingSite]

  # Value of the job 'Site' attribute: the site itself, the name of the site
  # group, or 'Multiple' when no group name can be determined
  if len(stagingSites) == 1:
    siteLabel = stagingSite
  else:
    siteLabel = 'Multiple'
    groupResult = self.__getSiteGroup(stagingSites)
    if groupResult['OK'] and groupResult['Value']:
      siteLabel = groupResult['Value']
  self.jobDB.setJobAttribute(job, 'Site', siteLabel)

  stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
  if not stagerDict['OK']:
    return stagerDict
  self.__updateOtherSites(job, stagingSite, stagerDict['Value'], optInfo)
  return S_OK()
class DiracAdmin(API): """ Administrative functionalities """ ############################################################################# def __init__(self): """Internal initialization of the DIRAC Admin API. """ super(DiracAdmin, self).__init__() self.csAPI = CSAPI() self.dbg = False if gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG': self.dbg = True self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp') self.currentDir = os.getcwd() self.rssFlag = ResourceStatus().rssFlag self.sitestatus = SiteStatus() ############################################################################# def uploadProxy(self, group): """Upload a proxy to the DIRAC WMS. This method Example usage: >>> print diracAdmin.uploadProxy('lhcb_pilot') {'OK': True, 'Value': 0L} :param group: DIRAC Group :type job: string :return: S_OK,S_ERROR :param permanent: Indefinitely update proxy :type permanent: boolean """ return gProxyManager.uploadProxy(diracGroup=group) ############################################################################# def setProxyPersistency(self, userDN, userGroup, persistent=True): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True ) {'OK': True } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param persistent: Persistent flag :type persistent: boolean :return: S_OK,S_ERROR """ return gProxyManager.setPersistency(userDN, userGroup, persistent) ############################################################################# def checkProxyUploaded(self, userDN, userGroup, requiredTime): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True ) {'OK': True, 'Value' : True/False } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param requiredTime: Required life time of 
the uploaded proxy :type requiredTime: boolean :return: S_OK,S_ERROR """ return gProxyManager.userHasProxy(userDN, userGroup, requiredTime) ############################################################################# def getSiteMask(self, printOutput=False, status='Active'): """Retrieve current site mask from WMS Administrator service. Example usage: >>> print diracAdmin.getSiteMask() {'OK': True, 'Value': 0L} :return: S_OK,S_ERROR """ result = self.sitestatus.getSites(siteState=status) if result['OK']: sites = result['Value'] if printOutput: sites.sort() for site in sites: print site return result ############################################################################# def getBannedSites(self, gridType=[], printOutput=False): """Retrieve current list of banned and probing sites. Example usage: >>> print diracAdmin.getBannedSites() {'OK': True, 'Value': []} :return: S_OK,S_ERROR """ bannedSites = self.sitestatus.getSites(siteState='Banned') if not bannedSites['OK']: return bannedSites probingSites = self.sitestatus.getSites(siteState='Probing') if not probingSites['OK']: return probingSites mergedList = bannedSites['Value'] + probingSites['Value'] mergedList.sort() if printOutput: print '\n'.join(mergedList) return S_OK(mergedList) ############################################################################# def getSiteSection(self, site, printOutput=False): """Simple utility to get the list of CEs for DIRAC site name. 
Example usage: >>> print diracAdmin.getSiteSection('LCG.CERN.ch') {'OK': True, 'Value':} :return: S_OK,S_ERROR """ gridType = site.split('.')[0] if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']: return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType)) result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site)) if printOutput and result['OK']: print self.pPrint.pformat(result['Value']) return result ############################################################################# def allowSite(self, site, comment, printOutput=False): """Adds the site to the site mask. Example usage: >>> print diracAdmin.allowSite() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result result = self.getSiteMask(status='Active') if not result['OK']: return result siteMask = result['Value'] if site in siteMask: if printOutput: print 'Site %s is already Active' % site return S_OK('Site %s is already Active' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Active', comment) else: wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.allowSite(site, comment) if not result['OK']: return result if printOutput: print 'Site %s status is set to Active' % site return result ############################################################################# def getSiteMaskLogging(self, site=None, printOutput=False): """Retrieves site mask logging information. 
Example usage: >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr') {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getSiteMaskLogging(site) if not result['OK']: return result if site: if not result['Value'].has_key(site): return S_ERROR('Site mask information not available for %s' % (site)) if printOutput: if site: print '\nSite Mask Logging Info for %s\n' % site else: print '\nAll Site Mask Logging Info\n' siteDict = result['Value'] for site, tupleList in siteDict.iteritems(): if not site: print '\n===> %s\n' % site for tup in tupleList: print str( tup[0] ).ljust( 8 ) + str( tup[1] ).ljust( 20 ) + \ '( ' + str( tup[2] ).ljust( len( str( tup[2] ) ) ) + ' ) "' + str( tup[3] ) + '"' print ' ' return result ############################################################################# def banSite(self, site, comment, printOutput=False): """Removes the site from the site mask. Example usage: >>> print diracAdmin.banSite() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result mask = self.getSiteMask(status='Banned') if not mask['OK']: return mask siteMask = mask['Value'] if site in siteMask: if printOutput: print 'Site %s is already Banned' % site return S_OK('Site %s is already Banned' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Banned', comment) else: wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.banSite(site, comment) if not result['OK']: return result if printOutput: print 'Site %s status is set to Banned' % site return result ############################################################################# def __checkSiteIsValid(self, site): """Internal function to check that a site name is valid. 
""" sites = getSiteCEMapping() if not sites['OK']: return S_ERROR('Could not get site CE mapping') siteList = sites['Value'].keys() if not site in siteList: return S_ERROR( 'Specified site %s is not in list of defined sites' % site) return S_OK('%s is valid' % site) ############################################################################# def clearMask(self): """Removes all sites from the site mask. Should be used with care. Example usage: >>> print diracAdmin.clearMask() {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.clearMask() return result ############################################################################# def getServicePorts(self, setup='', printOutput=False): """Checks the service ports for the specified setup. If not given this is taken from the current installation (/DIRAC/Setup) Example usage: >>> print diracAdmin.getServicePorts() {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ if not setup: setup = gConfig.getValue('/DIRAC/Setup', '') setupList = gConfig.getSections('/DIRAC/Setups', []) if not setupList['OK']: return S_ERROR('Could not get /DIRAC/Setups sections') setupList = setupList['Value'] if not setup in setupList: return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList))) serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup) if not serviceSetups['OK']: return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup) serviceSetups = serviceSetups['Value'] # dict systemList = gConfig.getSections('/Systems') if not systemList['OK']: return S_ERROR('Could not get Systems sections') systemList = systemList['Value'] result = {} for system in systemList: if serviceSetups.has_key(system): path = '/Systems/%s/%s/Services' % (system, serviceSetups[system]) servicesList = gConfig.getSections(path) if not servicesList['OK']: self.log.warn('Could not get sections in %s' % path) else: servicesList = servicesList['Value'] if 
not servicesList: servicesList = [] self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList))) for service in servicesList: spath = '%s/%s/Port' % (path, service) servicePort = gConfig.getValue(spath, 0) if servicePort: self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort)) result['%s/%s' % (system, service)] = servicePort else: self.log.warn('No port found for %s' % spath) else: self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup)) if printOutput: print self.pPrint.pformat(result) return S_OK(result) ############################################################################# def getProxy(self, userDN, userGroup, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadProxy(userDN, userGroup, limited=limited, requiredTimeLeft=validity) ############################################################################# def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and VOMS extensions and stores this in a file in the local directory by default. Example usage: >>> print diracAdmin.getVOMSProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited, requiredVOMSAttribute=vomsAttr, requiredTimeLeft=validity) ############################################################################# def getPilotProxy(self, userDN, userGroup, validity=43200): """Retrieves a pilot proxy with default 12hr validity and stores this in a file in the local directory by default. 
Example usage: >>> print diracAdmin.getVOMSProxy() {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.getPilotProxyFromDIRACGroup( userDN, userGroup, requiredTimeLeft=validity) ############################################################################# def resetJob(self, jobID): """Reset a job or list of jobs in the WMS. This operation resets the reschedule counter for a job or list of jobs and allows them to run as new. Example:: >>> print dirac.reset(12345) {'OK': True, 'Value': [12345]} :param job: JobID :type job: integer or list of integers :return: S_OK,S_ERROR """ if isinstance(jobID, basestring): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobID' ) elif isinstance(jobID, list): try: jobID = [int(job) for job in jobID] except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobIDs' ) jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False) result = jobManager.resetJob(jobID) return result ############################################################################# def getJobPilotOutput(self, jobID, directory=''): """Retrieve the pilot output for an existing job in the WMS. The output will be retrieved in a local directory unless otherwise specified. 
>>> print dirac.getJobPilotOutput(12345) {'OK': True, StdOut:'',StdError:''} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getJobPilotOutput(jobID) if not result['OK']: return result outputPath = '%s/pilot_%s' % (directory, jobID) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if outputs.has_key('StdOut'): stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.verbose('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if outputs.has_key('StdError'): stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdError']) self.log.verbose('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotOutput(self, gridReference, directory=''): """Retrieve the pilot output (std.out and std.err) for an existing job in the WMS. 
>>> print dirac.getJobPilotOutput(12345) {'OK': True, 'Value': {}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotOutput(gridReference) if not result['OK']: return result gridReferenceSmall = gridReference.split('/')[-1] if not gridReferenceSmall: gridReferenceSmall = 'reference' outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if outputs.has_key('StdOut'): stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.info('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if outputs.has_key('StdErr'): stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdErr']) self.log.info('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotInfo(self, gridReference): """Retrieve info relative to a pilot reference >>> print dirac.getPilotInfo(12345) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') wmsAdmin = 
RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotInfo(gridReference) return result ############################################################################# def killPilot(self, gridReference): """Kill the pilot specified >>> print dirac.getPilotInfo(12345) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :return: S_OK,S_ERROR """ if not isinstance(gridReference, basestring): return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.killPilot(gridReference) return result ############################################################################# def getPilotLoggingInfo(self, gridReference): """Retrieve the pilot logging info for an existing job in the WMS. >>> print dirac.getPilotLoggingInfo(12345) {'OK': True, 'Value': {"The output of the command"}} :param gridReference: Gridp pilot job reference Id :type gridReference: string :return: S_OK,S_ERROR """ if type(gridReference) not in types.StringTypes: return self._errorReport('Expected string for pilot reference') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') return wmsAdmin.getPilotLoggingInfo(gridReference) ############################################################################# def getJobPilots(self, jobID): """Extract the list of submitted pilots and their status for a given jobID from the WMS. Useful information is printed to the screen. 
>>> print dirac.getJobPilots() {'OK': True, 'Value': {PilotID:{StatusDict}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if isinstance(jobID, basestring): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or string for existing jobID') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilots(jobID) if result['OK']: print self.pPrint.pformat(result['Value']) return result ############################################################################# def getPilotSummary(self, startDate='', endDate=''): """Retrieve the pilot output for an existing job in the WMS. Summary is printed at INFO level, full dictionary of results also returned. >>> print dirac.getPilotSummary() {'OK': True, 'Value': {CE:{Status:Count}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.getPilotSummary(startDate, endDate) if not result['OK']: return result ceDict = result['Value'] headers = 'CE'.ljust(28) i = 0 for ce, summary in ceDict.iteritems(): states = summary.keys() if len(states) > i: i = len(states) for i in xrange(i): headers += 'Status'.ljust(12) + 'Count'.ljust(12) print headers for ce, summary in ceDict.iteritems(): line = ce.ljust(28) states = summary.keys() states.sort() for state in states: count = str(summary[state]) line += state.ljust(12) + count.ljust(12) print line return result ############################################################################# def selectRequests(self, jobID=None, requestID=None, requestName=None, requestType=None, status=None, operation=None, ownerDN=None, ownerGroup=None, requestStart=0, limit=100, printOutput=False): """Select requests from the request management system. 
A few notes on the selection criteria: - jobID is the WMS JobID for the request (if applicable) - requestID is assigned during submission of the request - requestName is the corresponding XML file name - requestType e.g. 'transfer' - status e.g. Done - operation e.g. replicateAndRegister - requestStart e.g. the first request to consider (start from 0 by default) - limit e.g. selection limit (default 100) >>> dirac.selectRequests(jobID='4894') {'OK': True, 'Value': [[<Requests>]]} """ options = { 'RequestID': requestID, 'RequestName': requestName, 'JobID': jobID, 'OwnerDN': ownerDN, 'OwnerGroup': ownerGroup, 'RequestType': requestType, 'Status': status, 'Operation': operation } conditions = {} for key, value in options.iteritems(): if value: try: conditions[key] = str(value) except Exception as x: return self._errorReport( str(x), 'Expected string for %s field' % key) try: requestStart = int(requestStart) limit = int(limit) except Exception as x: return self._errorReport(str(x), 'Expected integer for %s field' % limit) self.log.verbose('Will select requests with the following conditions') self.log.verbose(self.pPrint.pformat(conditions)) requestClient = RPCClient("RequestManagement/centralURL") result = requestClient.getRequestSummaryWeb(conditions, [], requestStart, limit) if not result['OK']: self.log.warn(result['Message']) return result requestIDs = result['Value'] conds = [] for key, value in conditions.iteritems(): if value: conds.append('%s = %s' % (key, value)) self.log.verbose( '%s request(s) selected with conditions %s and limit %s' % (len(requestIDs['Records']), ', '.join(conds), limit)) if printOutput: requests = [] if len(requestIDs['Records']) > limit: requestList = requestIDs['Records'] requests = requestList[:limit] else: requests = requestIDs['Records'] print '%s request(s) selected with conditions %s and limit %s' % ( len(requestIDs['Records']), ', '.join(conds), limit) print requestIDs['ParameterNames'] for request in requests: print request if 
not requestIDs: return S_ERROR('No requests selected for conditions: %s' % conditions) else: return result ############################################################################# def getRequestSummary(self, printOutput=False): """ Get a summary of the requests in the request DB. """ requestClient = RPCClient("RequestManagement/centralURL", timeout=120) result = requestClient.getDBSummary() if not result['OK']: self.log.warn(result['Message']) return result if printOutput: print self.pPrint.pformat(result['Value']) return result ############################################################################# def getExternalPackageVersions(self): """ Simple function that attempts to obtain the external versions for the local DIRAC installation (frequently needed for debugging purposes). """ gLogger.info( 'DIRAC version v%dr%d build %d' % (DIRAC.majorVersion, DIRAC.minorVersion, DIRAC.patchLevel)) try: import lcg_util infoStr = 'Using lcg_util from: \n%s' % lcg_util.__file__ gLogger.info(infoStr) infoStr = "The version of lcg_utils is %s" % lcg_util.lcg_util_version( ) gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import lcg_util: %s" % ( x) gLogger.exception(errStr) try: import gfalthr as gfal infoStr = "Using gfalthr from: \n%s" % gfal.__file__ gLogger.info(infoStr) infoStr = "The version of gfalthr is %s" % gfal.gfal_version() gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import gfalthr: %s." 
% ( x) gLogger.warn(errStr) try: import gfal infoStr = "Using gfal from: %s" % gfal.__file__ gLogger.info(infoStr) infoStr = "The version of gfal is %s" % gfal.gfal_version() gLogger.info(infoStr) except Exception as x: errStr = "SRM2Storage.__init__: Failed to import gfal: %s" % ( x) gLogger.exception(errStr) defaultProtocols = gConfig.getValue( '/Resources/StorageElements/DefaultProtocols', []) gLogger.info('Default list of protocols are: %s' % (', '.join(defaultProtocols))) return S_OK() ############################################################################# def getSiteProtocols(self, site, printOutput=False): """ Allows to check the defined protocols for each site SE. """ result = self.__checkSiteIsValid(site) if not result['OK']: return result siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site) siteSEs = gConfig.getValue(siteSection, []) if not siteSEs: return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection)) defaultProtocols = gConfig.getValue( '/Resources/StorageElements/DefaultProtocols', []) self.log.verbose('Default list of protocols are' ', '.join(defaultProtocols)) seInfo = {} siteSEs.sort() for se in siteSEs: sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se)) if not sections['OK']: return sections for section in sections['Value']: if gConfig.getValue( '/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2': path = '/Resources/StorageElements/%s/%s/ProtocolsList' % ( se, section) seProtocols = gConfig.getValue(path, []) if not seProtocols: seProtocols = defaultProtocols seInfo[se] = seProtocols if printOutput: print '\nSummary of protocols for StorageElements at site %s' % site print '\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust( 30) + '\n' for se, protocols in seInfo.iteritems(): print se.ljust(30) + ', '.join(protocols).ljust(30) return S_OK(seInfo) ############################################################################# def setSiteProtocols(self, 
site, protocolsList, printOutput=False): """ Allows to set the defined protocols for each SE for a given site. """ result = self.__checkSiteIsValid(site) if not result['OK']: return result siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site) siteSEs = gConfig.getValue(siteSection, []) if not siteSEs: return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection)) defaultProtocols = gConfig.getValue( '/Resources/StorageElements/DefaultProtocols', []) self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols)) for protocol in protocolsList: if not protocol in defaultProtocols: return S_ERROR( 'Requested to set protocol %s in list but %s is not ' 'in default list of protocols:\n%s' % (protocol, protocol, ', '.join(defaultProtocols))) modifiedCS = False result = promptUser( 'Do you want to add the following default protocols:' ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs))) if not result['OK']: return result if result['Value'].lower() != 'y': self.log.always('No protocols will be added') return S_OK() for se in siteSEs: sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se)) if not sections['OK']: return sections for section in sections['Value']: if gConfig.getValue( '/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2': path = '/Resources/StorageElements/%s/%s/ProtocolsList' % ( se, section) self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList))) result = self.csSetOption(path, ', '.join(protocolsList)) if not result['OK']: return result modifiedCS = True if modifiedCS: result = self.csCommitChanges(False) if not result['OK']: return S_ERROR('CS Commit failed with message = %s' % (result['Message'])) else: if printOutput: print 'Successfully committed changes to CS' else: if printOutput: print 'No modifications to CS required' return S_OK() ############################################################################# def csSetOption(self, 
optionPath, optionValue): """ Function to modify an existing value in the CS. """ return self.csAPI.setOption(optionPath, optionValue) ############################################################################# def csSetOptionComment(self, optionPath, comment): """ Function to modify an existing value in the CS. """ return self.csAPI.setOptionComment(optionPath, comment) ############################################################################# def csModifyValue(self, optionPath, newValue): """ Function to modify an existing value in the CS. """ return self.csAPI.modifyValue(optionPath, newValue) ############################################################################# def csRegisterUser(self, username, properties): """ Registers a user in the CS. - username: Username of the user (easy;) - properties: Dict containing: - DN - groups : list/tuple of groups the user belongs to - <others> : More properties of the user, like mail """ return self.csAPI.addUser(username, properties) ############################################################################# def csDeleteUser(self, user): """ Deletes a user from the CS. Can take a list of users """ return self.csAPI.deleteUsers(user) ############################################################################# def csModifyUser(self, username, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addUser and applies the changes """ return self.csAPI.modifyUser(username, properties, createIfNonExistant) ############################################################################# def csListUsers(self, group=False): """ Lists the users in the CS. If no group is specified return all users. """ return self.csAPI.listUsers(group) ############################################################################# def csDescribeUsers(self, mask=False): """ List users and their properties in the CS. 
If a mask is given, only users in the mask will be returned """ return self.csAPI.describeUsers(mask) ############################################################################# def csModifyGroup(self, groupname, properties, createIfNonExistant=False): """ Modify a user in the CS. Takes the same params as in addGroup and applies the changes """ return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant) ############################################################################# def csListHosts(self): """ Lists the hosts in the CS """ return self.csAPI.listHosts() ############################################################################# def csDescribeHosts(self, mask=False): """ Gets extended info for the hosts in the CS """ return self.csAPI.describeHosts(mask) ############################################################################# def csModifyHost(self, hostname, properties, createIfNonExistant=False): """ Modify a host in the CS. Takes the same params as in addHost and applies the changes """ return self.csAPI.modifyHost(hostname, properties, createIfNonExistant) ############################################################################# def csListGroups(self): """ Lists groups in the CS """ return self.csAPI.listGroups() ############################################################################# def csDescribeGroups(self, mask=False): """ List groups and their properties in the CS. 
If a mask is given, only groups in the mask will be returned """ return self.csAPI.describeGroups(mask) ############################################################################# def csSyncUsersWithCFG(self, usersCFG): """ Synchronize users in cfg with its contents """ return self.csAPI.syncUsersWithCFG(usersCFG) ############################################################################# def csCommitChanges(self, sortUsers=True): """ Commit the changes in the CS """ return self.csAPI.commitChanges(sortUsers=False) ############################################################################# def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False): """ Send mail to specified address with body. """ notification = NotificationClient() return notification.sendMail(address, subject, body, fromAddress, localAttempt, html) ############################################################################# def sendSMS(self, userName, body, fromAddress=None): """ Send mail to specified address with body. 
""" if len(body) > 160: return S_ERROR('Exceeded maximum SMS length of 160 characters') notification = NotificationClient() return notification.sendSMS(userName, body, fromAddress) ############################################################################# def getBDIISite(self, site, host=None): """ Get information about site from BDII at host """ return ldapSite(site, host=host) ############################################################################# def getBDIICluster(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCluster(ce, host=host) ############################################################################# def getBDIICE(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapCE(ce, host=host) ############################################################################# def getBDIIService(self, ce, host=None): """ Get information about ce from BDII at host """ return ldapService(ce, host=host) ############################################################################# def getBDIICEState(self, ce, useVO=voName, host=None): """ Get information about ce state from BDII at host """ return ldapCEState(ce, useVO, host=host) ############################################################################# def getBDIICEVOView(self, ce, useVO=voName, host=None): """ Get information about ce voview from BDII at host """ return ldapCEVOView(ce, useVO, host=host) ############################################################################# def getBDIISE(self, site, useVO=voName, host=None): """ Get information about SA from BDII at host """ return ldapSE(site, useVO, host=host)
DIRACExit(-1)
# NOTE(review): the statement above closes an error branch that starts before
# this chunk; it is kept exactly as found.
voName = result['Value']

# Queues of the requested sites for this VO.
resultQueues = Resources.getQueues(siteList=sites, community=voName)
if not resultQueues['OK']:
    gLogger.error('Failed to get CE information')
    DIRACExit(-1)
siteDict = resultQueues['Value']

result = getQueuesResolved(siteDict)
# Check the outcome of getQueuesResolved itself; the previous test looked at
# resultQueues again, so a failure here went unnoticed until result['Value'].
if not result['OK']:
    gLogger.error('Failed to get CE information')
    DIRACExit(-1)
queueDict = result['Value']

# get list of usable sites within this cycle
resultMask = SiteStatus().getUsableSites()
if not resultMask['OK']:
    gLogger.error('Failed to get Site mask information')
    DIRACExit(-1)
siteMaskList = resultMask.get('Value', [])

rssClient = ResourceStatus()

fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason')
records = []

for queue, queueInfo in queueDict.iteritems():
    site = queueInfo['Site']
    ce = queueInfo['CEName']
    # The CE status starts from the status of its site in the usable-sites mask.
    siteStatus = "Active" if site in siteMaskList else "InActive"
    ceStatus = siteStatus
class DiracAdmin(API):
    """Administrative functionalities."""

    #############################################################################
    def __init__(self):
        """Internal initialization of the DIRAC Admin API.

        Sets up the CS API client, the debug flag (from the LogLevel option of
        this API's section), the scratch and current directories, the RSS flag
        and the SiteStatus helper.
        """
        super(DiracAdmin, self).__init__()

        self.csAPI = CSAPI()

        self.dbg = gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG'

        self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp')
        self.currentDir = os.getcwd()
        self.rssFlag = ResourceStatus().rssFlag
        self.sitestatus = SiteStatus()

    #############################################################################
    def uploadProxy(self, group):
        """Upload a proxy to the DIRAC WMS.

        Thin wrapper around ``gProxyManager.uploadProxy``.

        Example usage:

          >>> print diracAdmin.uploadProxy('lhcb_pilot')
          {'OK': True, 'Value': 0L}

        :param group: DIRAC Group
        :type group: string
        :return: S_OK,S_ERROR
        """
        return gProxyManager.uploadProxy(diracGroup=group)

    #############################################################################
    def setProxyPersistency(self, userDN, userGroup, persistent=True):
        """Set the persistence of a proxy in the Proxy Manager.

        Example usage:

          >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )
          {'OK': True }

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param persistent: Persistent flag
        :type persistent: boolean
        :return: S_OK,S_ERROR
        """
        return gProxyManager.setPersistency(userDN, userGroup, persistent)

    #############################################################################
    def checkProxyUploaded(self, userDN, userGroup, requiredTime):
        """Check whether a user has a proxy uploaded in the Proxy Manager.

        Thin wrapper around ``gProxyManager.userHasProxy``.

        Example usage:

          >>> print diracAdmin.checkProxyUploaded( 'some DN', 'dirac group', 0 )
          {'OK': True, 'Value' : True/False }

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param requiredTime: Required life time of the uploaded proxy
        :return: S_OK,S_ERROR
        """
        return gProxyManager.userHasProxy(userDN, userGroup, requiredTime)

    #############################################################################
    def getSiteMask(self, printOutput=False, status='Active'):
        """Retrieve the list of sites in a given state from SiteStatus.

        Example usage:

          >>> print diracAdmin.getSiteMask()
          {'OK': True, 'Value': 0L}

        :param printOutput: when True, sort the site list (in place) and print it
        :param status: site state to select (default 'Active')
        :return: S_OK,S_ERROR
        """
        result = self.sitestatus.getSites(siteState=status)
        if result['OK'] and printOutput:
            sites = result['Value']
            sites.sort()
            for site in sites:
                print(site)

        return result

    #############################################################################
    def getBannedSites(self, printOutput=False):
        """Retrieve current list of banned and probing sites.

        Example usage:

          >>> print diracAdmin.getBannedSites()
          {'OK': True, 'Value': []}

        :param printOutput: when True, print one site per line
        :return: S_OK( sorted list of Banned + Probing sites ), S_ERROR
        """
        bannedSites = self.sitestatus.getSites(siteState='Banned')
        if not bannedSites['OK']:
            return bannedSites

        probingSites = self.sitestatus.getSites(siteState='Probing')
        if not probingSites['OK']:
            return probingSites

        mergedList = sorted(bannedSites['Value'] + probingSites['Value'])

        if printOutput:
            print('\n'.join(mergedList))

        return S_OK(mergedList)

    #############################################################################
    def getSiteSection(self, site, printOutput=False):
        """Get the CS options of the section of a DIRAC site.

        The grid type is the part of the site name before the first '.'.

        Example usage:

          >>> print diracAdmin.getSiteSection('LCG.CERN.ch')
          {'OK': True, 'Value':}

        :param site: DIRAC site name
        :param printOutput: when True, pretty-print the options on success
        :return: S_OK,S_ERROR
        """
        gridType = site.split('.')[0]
        if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']:
            return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType))

        result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site))
        if printOutput and result['OK']:
            print(self.pPrint.pformat(result['Value']))
        return result

    #############################################################################
    def allowSite(self, site, comment, printOutput=False):
        """Add the site to the site mask (set it Active).

        Nothing is changed when the site is already Active. Depending on
        self.rssFlag the status is set through SiteStatus or through the
        WMSAdministrator service.

        Example usage:

          >>> print diracAdmin.allowSite()
          {'OK': True, 'Value': }

        :param site: DIRAC site name
        :param comment: reason recorded with the status change
        :param printOutput: when True, print the outcome
        :return: S_OK,S_ERROR
        """
        result = self.__checkSiteIsValid(site)
        if not result['OK']:
            return result

        result = self.getSiteMask(status='Active')
        if not result['OK']:
            return result
        if site in result['Value']:
            if printOutput:
                print('Site %s is already Active' % site)
            return S_OK('Site %s is already Active' % site)

        if self.rssFlag:
            result = self.sitestatus.setSiteStatus(site, 'Active', comment)
        else:
            wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
            result = wmsAdmin.allowSite(site, comment)
        if not result['OK']:
            return result

        if printOutput:
            print('Site %s status is set to Active' % site)

        return result

    #############################################################################
    def getSiteMaskLogging(self, site=None, printOutput=False):
        """Retrieve site mask logging information.

        Example usage:

          >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr')
          {'OK': True, 'Value': }

        :param site: DIRAC site name; when not given no site validation is
                     done and the request is sent without a site filter
        :param printOutput: when True, print the logging entries
        :return: S_OK,S_ERROR
        """
        # Only an explicit site name can be validated; the default (None) used
        # to be rejected here, which made the "all sites" output unreachable.
        if site:
            result = self.__checkSiteIsValid(site)
            if not result['OK']:
                return result

        if self.rssFlag:
            result = ResourceStatusClient().selectStatusElement('Site', 'History', name=site)
        else:
            result = RPCClient('WorkloadManagement/WMSAdministrator').getSiteMaskLogging(site)

        if not result['OK']:
            return result

        if printOutput:
            if site:
                print('\nSite Mask Logging Info for %s\n' % site)
            else:
                print('\nAll Site Mask Logging Info\n')

            sitesLogging = result['Value']
            if isinstance(sitesLogging, dict):
                for siteName, tupleList in sitesLogging.iteritems():
                    # A per-site header is only needed when several sites are listed.
                    if not site:
                        print('\n===> %s\n' % siteName)
                    for tup in tupleList:
                        print(str(tup[0]).ljust(8) + str(tup[1]).ljust(20) +
                              '( ' + str(tup[2]).ljust(len(str(tup[2]))) + ' ) "' + str(tup[3]) + '"')
                    print(' ')
            elif isinstance(sitesLogging, list):
                # Print the selected columns instead of replacing the S_OK
                # structure with a bare list, so callers can still test ['OK'].
                for siteLog in [(sl[1], sl[3], sl[4]) for sl in sitesLogging]:
                    print(siteLog)

        return result

    #############################################################################
    def banSite(self, site, comment, printOutput=False):
        """Remove the site from the site mask (set it Banned).

        Nothing is changed when the site is already Banned. Depending on
        self.rssFlag the status is set through SiteStatus or through the
        WMSAdministrator service.

        Example usage:

          >>> print diracAdmin.banSite()
          {'OK': True, 'Value': }

        :param site: DIRAC site name
        :param comment: reason recorded with the status change
        :param printOutput: when True, print the outcome
        :return: S_OK,S_ERROR
        """
        result = self.__checkSiteIsValid(site)
        if not result['OK']:
            return result

        mask = self.getSiteMask(status='Banned')
        if not mask['OK']:
            return mask
        if site in mask['Value']:
            if printOutput:
                print('Site %s is already Banned' % site)
            return S_OK('Site %s is already Banned' % site)

        if self.rssFlag:
            result = self.sitestatus.setSiteStatus(site, 'Banned', comment)
        else:
            wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
            result = wmsAdmin.banSite(site, comment)
        if not result['OK']:
            return result

        if printOutput:
            print('Site %s status is set to Banned' % site)

        return result

    #############################################################################
    def __checkSiteIsValid(self, site):
        """Internal function to check that a site name is valid.

        A site is valid when it is a key of the site/CE mapping.

        :param site: DIRAC site name
        :return: S_OK,S_ERROR
        """
        sites = getSiteCEMapping()
        if not sites['OK']:
            return S_ERROR('Could not get site CE mapping')
        if site not in sites['Value'].keys():
            return S_ERROR('Specified site %s is not in list of defined sites' % site)

        return S_OK('%s is valid' % site)

    #############################################################################
    def clearMask(self):
        """Remove all sites from the site mask. Should be used with care.

        Example usage:

          >>> print diracAdmin.clearMask()
          {'OK': True, 'Value':''}

        :return: S_OK,S_ERROR
        """
        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        return wmsAdmin.clearMask()

    #############################################################################
    def getServicePorts(self, setup='', printOutput=False):
        """Check the service ports for the specified setup.

        If not given the setup is taken from the current installation
        (/DIRAC/Setup).

        Example usage:

          >>> print diracAdmin.getServicePorts()
          {'OK': True, 'Value':''}

        :param setup: DIRAC setup name
        :param printOutput: when True, pretty-print the result
        :return: S_OK( { 'System/Service': port } ), S_ERROR
        """
        if not setup:
            setup = gConfig.getValue('/DIRAC/Setup', '')

        setupList = gConfig.getSections('/DIRAC/Setups', [])
        if not setupList['OK']:
            return S_ERROR('Could not get /DIRAC/Setups sections')
        setupList = setupList['Value']
        if setup not in setupList:
            return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList)))

        serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
        if not serviceSetups['OK']:
            return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)
        serviceSetups = serviceSetups['Value']  # dict

        systemList = gConfig.getSections('/Systems')
        if not systemList['OK']:
            return S_ERROR('Could not get Systems sections')
        systemList = systemList['Value']

        result = {}
        for system in systemList:
            if system not in serviceSetups:
                self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup))
                continue

            path = '/Systems/%s/%s/Services' % (system, serviceSetups[system])
            servicesList = gConfig.getSections(path)
            if not servicesList['OK']:
                self.log.warn('Could not get sections in %s' % path)
                continue

            servicesList = servicesList['Value']
            if not servicesList:
                servicesList = []
            self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList)))
            for service in servicesList:
                spath = '%s/%s/Port' % (path, service)
                servicePort = gConfig.getValue(spath, 0)
                if servicePort:
                    self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort))
                    result['%s/%s' % (system, service)] = servicePort
                else:
                    self.log.warn('No port found for %s' % spath)

        if printOutput:
            print(self.pPrint.pformat(result))

        return S_OK(result)

    #############################################################################
    def getProxy(self, userDN, userGroup, validity=43200, limited=False):
        """Download a proxy for the given user DN and group.

        Thin wrapper around ``gProxyManager.downloadProxy``.

        Example usage:

          >>> print diracAdmin.getProxy()
          {'OK': True, 'Value': }

        :param userDN: user DN
        :param userGroup: DIRAC group
        :param validity: required time left, passed as ``requiredTimeLeft``
                         (default 43200, i.e. 12 hours if the unit is seconds)
        :param limited: passed through as ``limited``
        :return: S_OK,S_ERROR
        """
        return gProxyManager.downloadProxy(userDN, userGroup, limited=limited,
                                           requiredTimeLeft=validity)

    #############################################################################
    def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False):
        """Download a proxy with VOMS extensions for the given user DN and group.

        Thin wrapper around ``gProxyManager.downloadVOMSProxy``.

        Example usage:

          >>> print diracAdmin.getVOMSProxy()
          {'OK': True, 'Value': }

        :param userDN: user DN
        :param userGroup: DIRAC group
        :param vomsAttr: passed as ``requiredVOMSAttribute``
        :param validity: required time left, passed as ``requiredTimeLeft``
        :param limited: passed through as ``limited``
        :return: S_OK,S_ERROR
        """
        return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited,
                                               requiredVOMSAttribute=vomsAttr,
                                               requiredTimeLeft=validity)

    #############################################################################
    def getPilotProxy(self, userDN, userGroup, validity=43200):
        """Retrieve a pilot proxy for the given user DN and group.

        Thin wrapper around ``gProxyManager.getPilotProxyFromDIRACGroup``.

        Example usage:

          >>> print diracAdmin.getPilotProxy()
          {'OK': True, 'Value': }

        :param userDN: user DN
        :param userGroup: DIRAC group
        :param validity: required time left, passed as ``requiredTimeLeft``
        :return: S_OK,S_ERROR
        """
        return gProxyManager.getPilotProxyFromDIRACGroup(userDN, userGroup,
                                                         requiredTimeLeft=validity)

    #############################################################################
    def resetJob(self, jobID):
        """Reset a job or list of jobs in the WMS.

        A string jobID, or every element of a list, is converted to an integer
        before the request is sent to the JobManager service.

        Example::

          >>> print dirac.reset(12345)
          {'OK': True, 'Value': [12345]}

        :param jobID: JobID
        :type jobID: integer, string or list of integers/strings
        :return: S_OK,S_ERROR
        """
        if isinstance(jobID, basestring):
            try:
                jobID = int(jobID)
            except (TypeError, ValueError) as x:
                return self._errorReport(
                    str(x), 'Expected integer or convertible integer for existing jobID')
        elif isinstance(jobID, list):
            try:
                jobID = [int(job) for job in jobID]
            except (TypeError, ValueError) as x:
                return self._errorReport(
                    str(x), 'Expected integer or convertible integer for existing jobIDs')

        jobManager = RPCClient('WorkloadManagement/JobManager', useCertificates=False)
        return jobManager.resetJob(jobID)

    #############################################################################
    def getJobPilotOutput(self, jobID, directory=''):
        """Retrieve the pilot output for an existing job in the WMS.

        On success a new directory ``<directory>/pilot_<jobID>`` is created and
        the 'StdOut' / 'StdError' entries of the service reply, when present,
        are written there as ``std.out`` / ``std.err``.

          >>> print dirac.getJobPilotOutput(12345)
          {'OK': True, StdOut:'',StdError:''}

        :param jobID: JobID
        :type jobID: integer or string
        :param directory: existing target directory (defaults to self.currentDir)
        :return: S_OK,S_ERROR
        """
        if not directory:
            directory = self.currentDir

        if not os.path.exists(directory):
            return self._errorReport('Directory %s does not exist' % directory)

        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        result = wmsAdmin.getJobPilotOutput(jobID)
        if not result['OK']:
            return result

        outputPath = '%s/pilot_%s' % (directory, jobID)
        # Never overwrite a previous retrieval: the user has to clean up first.
        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

        outputs = result['Value']
        if 'StdOut' in outputs:
            stdout = '%s/std.out' % (outputPath)
            with open(stdout, 'w') as fopen:
                fopen.write(outputs['StdOut'])
            self.log.verbose('Standard output written to %s' % (stdout))
        else:
            self.log.warn('No standard output returned')

        if 'StdError' in outputs:
            stderr = '%s/std.err' % (outputPath)
            with open(stderr, 'w') as fopen:
                fopen.write(outputs['StdError'])
            self.log.verbose('Standard error written to %s' % (stderr))
        else:
            self.log.warn('No standard error returned')

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result

    #############################################################################
    def getPilotOutput(self, gridReference, directory=''):
        """Retrieve the pilot output (std.out and std.err) for a pilot reference.

        On success a new directory ``<directory>/pilot_<last part of reference>``
        is created and the 'StdOut' / 'StdErr' entries of the service reply,
        when present, are written there as ``std.out`` / ``std.err``.

          >>> print dirac.getPilotOutput('https://some.host/pilotRef')
          {'OK': True, 'Value': {}}

        :param gridReference: pilot job reference
        :type gridReference: string
        :param directory: existing target directory (defaults to self.currentDir)
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, basestring):
            return self._errorReport('Expected string for pilot reference')

        if not directory:
            directory = self.currentDir

        if not os.path.exists(directory):
            return self._errorReport('Directory %s does not exist' % directory)

        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        result = wmsAdmin.getPilotOutput(gridReference)
        if not result['OK']:
            return result

        # Use the last path component of the reference as directory suffix,
        # falling back to a fixed name when the reference ends with '/'.
        gridReferenceSmall = gridReference.split('/')[-1]
        if not gridReferenceSmall:
            gridReferenceSmall = 'reference'
        outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

        # Never overwrite a previous retrieval: the user has to clean up first.
        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

        outputs = result['Value']
        if 'StdOut' in outputs:
            stdout = '%s/std.out' % (outputPath)
            with open(stdout, 'w') as fopen:
                fopen.write(outputs['StdOut'])
            self.log.info('Standard output written to %s' % (stdout))
        else:
            self.log.warn('No standard output returned')

        if 'StdErr' in outputs:
            stderr = '%s/std.err' % (outputPath)
            with open(stderr, 'w') as fopen:
                fopen.write(outputs['StdErr'])
            self.log.info('Standard error written to %s' % (stderr))
        else:
            self.log.warn('No standard error returned')

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result
#############################################################################
def getPilotInfo(self, gridReference):
    """Retrieve info relative to a pilot reference.

    The reference is forwarded to the WorkloadManagement/WMSAdministrator
    service and its answer is returned untouched.

    >>> print dirac.getPilotInfo('<pilot reference>')
    {'OK': True, 'Value': {}}

    :param gridReference: Pilot Job Reference
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
        return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.getPilotInfo(gridReference)

#############################################################################
def killPilot(self, gridReference):
    """Kill the pilot specified.

    >>> print dirac.killPilot('<pilot reference>')
    {'OK': True, 'Value': {}}

    :param gridReference: Pilot Job Reference
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
        return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.killPilot(gridReference)

#############################################################################
def getPilotLoggingInfo(self, gridReference):
    """Retrieve the pilot logging info for an existing pilot in the WMS.

    >>> print dirac.getPilotLoggingInfo('<pilot reference>')
    {'OK': True, 'Value': {"The output of the command"}}

    :param gridReference: Grid pilot job reference Id
    :type gridReference: string
    :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
        return self._errorReport('Expected string for pilot reference')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    return wmsAdmin.getPilotLoggingInfo(gridReference)

#############################################################################
def getJobPilots(self, jobID):
    """Extract the list of submitted pilots and their status for a given
    jobID from the WMS. On success the pilots dictionary is pretty-printed
    to the screen.

    >>> print dirac.getJobPilots(12345)
    {'OK': True, 'Value': {PilotID:{StatusDict}}}

    :param jobID: JobID
    :type jobID: integer or string
    :return: S_OK,S_ERROR
    """
    if isinstance(jobID, basestring):
        try:
            jobID = int(jobID)
        except ValueError as x:
            return self._errorReport(str(x), 'Expected integer or string for existing jobID')

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilots(jobID)
    if result['OK']:
        print(self.pPrint.pformat(result['Value']))
    return result

#############################################################################
def getPilotSummary(self, startDate='', endDate=''):
    """Retrieve the pilot summary from the WMS and print it as a table.

    One row is printed per CE, made of (Status, Count) column pairs; the
    full dictionary of results is also returned.

    >>> print dirac.getPilotSummary()
    {'OK': True, 'Value': {CE:{Status:Count}}}

    :param startDate: passed as is to WMSAdministrator.getPilotSummary
    :param endDate: passed as is to WMSAdministrator.getPilotSummary
    :return: S_OK,S_ERROR
    """
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilotSummary(startDate, endDate)
    if not result['OK']:
        return result

    ceDict = result['Value']

    # The header needs as many Status/Count column pairs as the widest row.
    maxStates = 0
    for summary in ceDict.values():
        maxStates = max(maxStates, len(summary))
    print('CE'.ljust(28) + ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates)

    for ce, summary in ceDict.items():
        line = ce.ljust(28)
        for state in sorted(summary):
            line += state.ljust(12) + str(summary[state]).ljust(12)
        print(line)

    return result

#############################################################################
def selectRequests(self, jobID=None, requestID=None, requestName=None, requestType=None,
                   status=None, operation=None, ownerDN=None, ownerGroup=None,
                   requestStart=0, limit=100, printOutput=False):
    """Select requests from the request management system.

    A few notes on the selection criteria:

      - jobID is the WMS JobID for the request (if applicable)
      - requestID is assigned during submission of the request
      - requestName is the corresponding XML file name
      - requestType e.g. 'transfer'
      - status e.g. Done
      - operation e.g. replicateAndRegister
      - requestStart e.g. the first request to consider (start from 0 by default)
      - limit e.g. selection limit (default 100)

    Only the criteria with a true value are sent as conditions, each one
    converted to a string.

    >>> dirac.selectRequests(jobID='4894')
    {'OK': True, 'Value': [[<Requests>]]}

    :param printOutput: if True, print the selected records to the screen
    :return: S_OK,S_ERROR
    """
    options = {'RequestID': requestID, 'RequestName': requestName, 'JobID': jobID,
               'OwnerDN': ownerDN, 'OwnerGroup': ownerGroup, 'RequestType': requestType,
               'Status': status, 'Operation': operation}
    conditions = {}
    for key, value in options.items():
        if value:
            try:
                conditions[key] = str(value)
            except Exception as x:
                return self._errorReport(str(x), 'Expected string for %s field' % key)

    try:
        requestStart = int(requestStart)
        limit = int(limit)
    except Exception as x:
        # The message used to interpolate the *value* of limit as a field name.
        return self._errorReport(str(x), 'Expected integer for requestStart and limit fields')

    self.log.verbose('Will select requests with the following conditions')
    self.log.verbose(self.pPrint.pformat(conditions))
    requestClient = RPCClient("RequestManagement/centralURL")
    result = requestClient.getRequestSummaryWeb(conditions, [], requestStart, limit)
    if not result['OK']:
        self.log.warn(result['Message'])
        return result

    requestIDs = result['Value']
    records = requestIDs['Records']
    conds = ['%s = %s' % (key, value) for key, value in conditions.items() if value]
    summary = '%s request(s) selected with conditions %s and limit %s' % (len(records),
                                                                          ', '.join(conds),
                                                                          limit)
    self.log.verbose(summary)

    if printOutput:
        print(summary)
        print(requestIDs['ParameterNames'])
        # Never show more than `limit` records, whatever the service returned.
        for request in records[:limit]:
            print(request)

    # NOTE(review): requestIDs was already indexed with 'Records' above, so this
    # branch looks unreachable; presumably "no records" was meant - confirm
    # before changing what callers get back for an empty selection.
    if not requestIDs:
        return S_ERROR('No requests selected for conditions: %s' % conditions)
    return result

#############################################################################
def getRequestSummary(self, printOutput=False):
    """Get a summary of the requests in the request DB.

    :param printOutput: if True, pretty-print the summary to the screen
    :return: S_OK,S_ERROR
    """
    requestClient = RPCClient("RequestManagement/centralURL", timeout=120)
    result = requestClient.getDBSummary()
    if not result['OK']:
        self.log.warn(result['Message'])
        return result

    if printOutput:
        print(self.pPrint.pformat(result['Value']))

    return result

#############################################################################
def getExternalPackageVersions(self):
    """Simple function that attempts to obtain the external versions for
    the local DIRAC installation (frequently needed for debugging purposes).

    Logs the DIRAC version, then tries in turn to import lcg_util, gfalthr
    and gfal, logging where each one comes from and its version. A failure
    with one package is only logged and does not stop the others.

    :return: S_OK
    """
    gLogger.info('DIRAC version v%dr%d build %d' % (DIRAC.majorVersion,
                                                     DIRAC.minorVersion,
                                                     DIRAC.patchLevel))
    try:
        import lcg_util  # pylint: disable=import-error
        infoStr = 'Using lcg_util from: \n%s' % lcg_util.__file__
        gLogger.info(infoStr)
        infoStr = "The version of lcg_utils is %s" % lcg_util.lcg_util_version()
        gLogger.info(infoStr)
    except Exception as x:
        errStr = "SRM2Storage.__init__: Failed to import lcg_util: %s" % (x)
        gLogger.exception(errStr)

    try:
        import gfalthr as gfal  # pylint: disable=import-error
        infoStr = "Using gfalthr from: \n%s" % gfal.__file__
        gLogger.info(infoStr)
        infoStr = "The version of gfalthr is %s" % gfal.gfal_version()
        gLogger.info(infoStr)
    except Exception as x:
        errStr = "SRM2Storage.__init__: Failed to import gfalthr: %s." % (x)
        gLogger.warn(errStr)
        # Only when the threaded flavour is unusable do we try plain gfal.
        try:
            import gfal  # pylint: disable=import-error
            infoStr = "Using gfal from: %s" % gfal.__file__
            gLogger.info(infoStr)
            infoStr = "The version of gfal is %s" % gfal.gfal_version()
            gLogger.info(infoStr)
        except Exception as x:
            errStr = "SRM2Storage.__init__: Failed to import gfal: %s" % (x)
            gLogger.exception(errStr)

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    gLogger.info('Default list of protocols are: %s' % (', '.join(defaultProtocols)))
    return S_OK()

#############################################################################
def getSiteProtocols(self, site, printOutput=False):
    """Allows to check the defined protocols for each site SE.

    For every SE of the site, the ProtocolsList of its SRM2 section is read
    from the CS; an empty list falls back to the default protocols.

    :param site: site name; the part before the first '.' selects the CS section
    :param printOutput: if True, print a table of SE / protocols
    :return: S_OK( { se : protocols } ) or S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
        return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
        return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    # A missing comma here used to make the message the *separator* of the join.
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    seInfo = {}
    siteSEs.sort()
    for se in siteSEs:
        sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
        if not sections['OK']:
            return sections
        for section in sections['Value']:
            protocolName = gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '')
            if protocolName == 'SRM2':
                path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
                seProtocols = gConfig.getValue(path, [])
                if not seProtocols:
                    seProtocols = defaultProtocols
                seInfo[se] = seProtocols

    if printOutput:
        print('\nSummary of protocols for StorageElements at site %s' % site)
        print('\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust(30) + '\n')
        for se, protocols in seInfo.items():
            print(se.ljust(30) + ', '.join(protocols).ljust(30))

    return S_OK(seInfo)

#############################################################################
def setSiteProtocols(self, site, protocolsList, printOutput=False):
    """Allows to set the defined protocols for each SE for a given site.

    Every requested protocol must be in the default list of protocols. The
    user is prompted for confirmation before anything is written; the CS is
    committed only if at least one SRM2 section was modified.

    :param site: site name; the part before the first '.' selects the CS section
    :param protocolsList: list of protocol names to set
    :param printOutput: if True, print the outcome to the screen
    :return: S_OK,S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
        return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
        return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    for protocol in protocolsList:
        if protocol not in defaultProtocols:
            return S_ERROR('Requested to set protocol %s in list but %s is not '
                           'in default list of protocols:\n%s' % (protocol, protocol,
                                                                   ', '.join(defaultProtocols)))

    modifiedCS = False
    result = promptUser('Do you want to add the following default protocols:'
                        ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs)))
    if not result['OK']:
        return result
    if result['Value'].lower() != 'y':
        self.log.always('No protocols will be added')
        return S_OK()

    for se in siteSEs:
        sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
        if not sections['OK']:
            return sections
        for section in sections['Value']:
            protocolName = gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '')
            if protocolName == 'SRM2':
                path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
                self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList)))
                result = self.csSetOption(path, ', '.join(protocolsList))
                if not result['OK']:
                    return result
                modifiedCS = True

    if modifiedCS:
        result = self.csCommitChanges(False)
        if not result['OK']:
            return S_ERROR('CS Commit failed with message = %s' % (result['Message']))
        if printOutput:
            print('Successfully committed changes to CS')
    elif printOutput:
        print('No modifications to CS required')

    return S_OK()

#############################################################################
def csSetOption(self, optionPath, optionValue):
    """Function to set the value of an option in the CS (delegates to
    csAPI.setOption).
    """
    return self.csAPI.setOption(optionPath, optionValue)

#############################################################################
def csSetOptionComment(self, optionPath, comment):
    """Function to set the comment of an option in the CS (delegates to
    csAPI.setOptionComment).
    """
    return self.csAPI.setOptionComment(optionPath, comment)

#############################################################################
def csModifyValue(self, optionPath, newValue):
    """Function to modify an existing value in the CS.
    """
    return self.csAPI.modifyValue(optionPath, newValue)

#############################################################################
def csRegisterUser(self, username, properties):
    """Registers a user in the CS.

      - username: Username of the user (easy;)
      - properties: Dict containing:
          - DN
          - groups : list/tuple of groups the user belongs to
          - <others> : More properties of the user, like mail
    """
    return self.csAPI.addUser(username, properties)

#############################################################################
def csDeleteUser(self, user):
    """Deletes a user from the CS. Can take a list of users.
    """
    return self.csAPI.deleteUsers(user)

#############################################################################
def csModifyUser(self, username, properties, createIfNonExistant=False):
    """Modify a user in the CS. Takes the same params as in addUser and
    applies the changes.
    """
    return self.csAPI.modifyUser(username, properties, createIfNonExistant)

#############################################################################
def csListUsers(self, group=False):
    """Lists the users in the CS. If no group is specified return all users.
    """
    return self.csAPI.listUsers(group)

#############################################################################
def csDescribeUsers(self, mask=False):
    """List users and their properties in the CS.
    If a mask is given, only users in the mask will be returned.
    """
    return self.csAPI.describeUsers(mask)

#############################################################################
def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
    """Modify a group in the CS. Takes the same params as in addGroup and
    applies the changes.
    """
    return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant)

#############################################################################
def csListHosts(self):
    """Lists the hosts in the CS.
    """
    return self.csAPI.listHosts()

#############################################################################
def csDescribeHosts(self, mask=False):
    """Gets extended info for the hosts in the CS.
    """
    return self.csAPI.describeHosts(mask)

#############################################################################
def csModifyHost(self, hostname, properties, createIfNonExistant=False):
    """Modify a host in the CS. Takes the same params as in addHost and
    applies the changes.
    """
    return self.csAPI.modifyHost(hostname, properties, createIfNonExistant)

#############################################################################
def csListGroups(self):
    """Lists groups in the CS.
    """
    return self.csAPI.listGroups()

#############################################################################
def csDescribeGroups(self, mask=False):
    """List groups and their properties in the CS.
    If a mask is given, only groups in the mask will be returned.
    """
    return self.csAPI.describeGroups(mask)

#############################################################################
def csSyncUsersWithCFG(self, usersCFG):
    """Synchronize users in cfg with its contents.
    """
    return self.csAPI.syncUsersWithCFG(usersCFG)

#############################################################################
def csCommitChanges(self, sortUsers=True):
    """Commit the changes in the CS.

    :param sortUsers: currently ignored, see the note in the body
    """
    # NOTE(review): sortUsers is accepted but never used - the commit is always
    # done with sortUsers=False. Honouring the argument would change what every
    # caller relying on the default gets, so it is left as is; confirm intent.
    return self.csAPI.commitChanges(sortUsers=False)

#############################################################################
def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False):
    """Send mail to specified address with body.
    """
    notification = NotificationClient()
    return notification.sendMail(address, subject, body, fromAddress, localAttempt, html)

#############################################################################
def sendSMS(self, userName, body, fromAddress=None):
    """Send an SMS with the given body to the specified user.

    :return: S_ERROR if body is longer than 160 characters, otherwise the
             result of NotificationClient.sendSMS
    """
    if len(body) > 160:
        return S_ERROR('Exceeded maximum SMS length of 160 characters')
    notification = NotificationClient()
    return notification.sendSMS(userName, body, fromAddress)

#############################################################################
def getBDIISite(self, site, host=None):
    """Get information about site from BDII at host.
    """
    return ldapSite(site, host=host)

#############################################################################
def getBDIICluster(self, ce, host=None):
    """Get information about the cluster of ce from BDII at host.
    """
    return ldapCluster(ce, host=host)

#############################################################################
def getBDIICE(self, ce, host=None):
    """Get information about ce from BDII at host.
    """
    return ldapCE(ce, host=host)

#############################################################################
def getBDIIService(self, ce, host=None):
    """Get information about the service of ce from BDII at host.
    """
    return ldapService(ce, host=host)

#############################################################################
def getBDIICEState(self, ce, useVO=voName, host=None):
    """Get information about ce state from BDII at host.
    """
    return ldapCEState(ce, useVO, host=host)

#############################################################################
def getBDIICEVOView(self, ce, useVO=voName, host=None):
    """Get information about ce voview from BDII at host.
    """
    return ldapCEVOView(ce, useVO, host=host)

#############################################################################
def getBDIISE(self, site, useVO=voName, host=None):
    """Get information about SA from BDII at host.
    """
    return ldapSE(site, useVO, host=host)
class Matcher(object):
    """Logic for matching a waiting job to the resource described by a pilot.

    The entry point is selectJob(); errors are reported by raising RuntimeError.
    """

    def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
        """c'tor

        Every collaborator can be injected; any one left out (or false) is
        created here with its default constructor.
        """
        self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
        self.jobDB = jobDB or JobDB()
        self.tqDB = tqDB or TaskQueueDB()
        self.jlDB = jlDB or JobLoggingDB()
        self.opsHelper = opsHelper or Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job
        matching the resource capacity.

        :param resourceDescription: dictionary describing the resource
        :param credDict: credentials dictionary of the requester
        :return: {} when nothing matches, otherwise a dictionary with JDL,
                 JobID, DN, Group, PilotInfoReportedFlag and the job
                 optimizer parameters
        :raise RuntimeError: on any failure talking to the DBs, or if the
                             matched job is not in Waiting state
        """
        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters: show the raw
        # MaxRAM / NumberOfProcessors instead of the tags generated from them.
        toPrintDict = dict(resourceDict)
        for name in ("MaxRAM", "NumberOfProcessors"):
            if name in resourceDescription:
                toPrintDict[name] = resourceDescription[name]
        printableTags = [tag for tag in resourceDict.get('Tag', [])
                         if not tag.endswith(('GB', 'Processors'))]
        if printableTags:
            toPrintDict['Tag'] = printableTags
        else:
            toPrintDict.pop('Tag', None)
        gLogger.info('Resource description for matching', printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
        result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)
        if not result['OK']:
            raise RuntimeError(result['Message'])
        result = result['Value']
        if not result['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = result['jobId']
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        if resAtt['Value']['Status'] != 'Waiting':
            # The task queue is out of sync with the job DB: drop the stale entry.
            self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {'JDL': result['Value'], 'JobID': jobID}

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            resultDict.update(resOpt['Value'])
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        if not resourceDict.get('PilotInfoReportedFlag', False):
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """From resourceDescription to resourceDict (just various mods).

        Runs, in order: the description processing, the credentials check,
        the pilot version check and the site mask check.
        """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key in resourceDict:
            self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary.

        resourceDescription is a ceDict coming from a JobAgent, for example.

        :return: new dictionary holding only the fields used for the matching
        """
        resourceDict = {}

        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in tagMatchFields:
            # Tag-like fields are only copied when they carry a value...
            if name in resourceDescription and resourceDescription[name]:
                resourceDict[name] = resourceDescription[name]
            # ...while their Required<name> counterpart is copied whenever present.
            rname = 'Required%s' % name
            if rname in resourceDescription:
                resourceDict[rname] = resourceDescription[rname]

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get('MaxRAM')
        if maxRAM:
            try:
                # Floor division keeps an integer on Python 3 as well, which
                # range() below requires (identical result on Python 2).
                maxRAM = int(maxRAM) // 1000
            except ValueError:
                maxRAM = None
        nProcessors = resourceDescription.get('NumberOfProcessors')
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except ValueError:
                nProcessors = None
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            # Values above 128 generate no tags at all.
            if param and param <= 128:
                paramTags = ['%d%s' % (par, key) for par in range(2, param + 1)]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if 'Tag' in resourceDict:
            # Remove duplicates (the order of the tags is not preserved).
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                  'VirtualOrganization', 'PilotReference', 'PilotBenchmark',
                  'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Reports the status of the matched job in jobDB and jobLoggingDB.

        Do not fail if errors happen here: they are only logged.
        """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if result['OK']:
            self.log.verbose("Set job attributes for jobID %s" % jobID)
        else:
            self.log.error("Problem reporting job status",
                           "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))

        result = self.jlDB.addLoggingRecord(jobID, status='Matched', minor='Assigned',
                                            source='Matcher')
        if result['OK']:
            self.log.verbose("Added logging record for jobID %s" % jobID)
        else:
            self.log.error("Problem reporting job status",
                           "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :return: True if the site is among the usable sites, False otherwise
        :raise RuntimeError: if the site is missing or the site client fails
        """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        return resourceDict['Site'] in result['Value']

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it.

        Nothing is done when the resource carries no PilotReference.
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if not pilotReference:
            return

        gridCE = resourceDict.get('GridCE', 'Unknown')
        site = resourceDict.get('Site', 'Unknown')
        benchmark = resourceDict.get('PilotBenchmark', 0.0)
        self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' %
                         (pilotReference, gridCE, site, benchmark))

        result = self.pilotAgentsDB.setPilotStatus(pilotReference, status='Running',
                                                   gridSite=site, destination=gridCE,
                                                   benchmark=benchmark)
        if not result['OK']:
            self.log.warn("Problem updating pilot information",
                          "; setPilotStatus. pilotReference: %s; %s" % (pilotReference,
                                                                        result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information.

        Both DB updates are attempted; a failure is only logged.
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if not pilotReference:
            return

        result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
        if not result['OK']:
            self.log.error("Problem updating pilot information",
                           ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference,
                                                                        result['Message']))
        result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
        if not result['OK']:
            self.log.error("Problem updating pilot information",
                           "; setJobForPilot. pilotReference: %s; %s" % (pilotReference,
                                                                         result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials.

        resourceDict is updated in place with the OwnerDN / OwnerGroup
        constraints that follow from the credentials, and returned.

        :raise RuntimeError: if a Registry lookup fails
        """
        properties = credDict['properties']

        if Properties.GENERIC_PILOT in properties:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            result = Registry.getGroupsForVO(vo)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            resourceDict['OwnerGroup'] = result['Value']

        elif Properties.PILOT in properties:
            # If it's a private pilot, the DN has to be the same
            self.log.notice("Setting the resource DN to the credentials DN")
            resourceDict['OwnerDN'] = credDict['DN']

        elif Properties.JOB_SHARING in properties:
            # If it's a job sharing. The group has to be the same and just check
            # that the DN (if any) belongs to the same group
            resourceDict['OwnerGroup'] = credDict['group']
            self.log.notice("Setting the resource group to the credentials group")
            if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
                ownerDN = resourceDict['OwnerDN']
                result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                if not result['OK']:
                    raise RuntimeError(result['Message'])
                if credDict['group'] not in result['Value']:
                    # DN is not in the same group! bad boy.
                    self.log.notice(
                        "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN)
                    resourceDict['OwnerDN'] = credDict['DN']

        else:
            # Nothing special, group and DN have to be the same
            resourceDict['OwnerDN'] = credDict['DN']
            resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version.

        Nothing is checked unless the Pilot/CheckVersion option is true (the
        default). ReleaseVersion is preferred over DIRACVersion.

        :raise RuntimeError: if no version is provided, if the version is not
                             in Pilot/Version, or if Pilot/Project is set and
                             the pilot project is missing or different
        """
        if not self.opsHelper.getValue("Pilot/CheckVersion", True):
            return

        if 'ReleaseVersion' in resourceDict:
            pilotVersion = resourceDict['ReleaseVersion']
        elif 'DIRACVersion' in resourceDict:
            pilotVersion = resourceDict['DIRACVersion']
        else:
            raise RuntimeError('Version check requested and not provided by Pilot')

        validVersions = self.opsHelper.getValue("Pilot/Version", [])
        if validVersions and pilotVersion not in validVersions:
            raise RuntimeError(
                'Pilot version does not match the production version %s not in ( %s )' %
                (pilotVersion, ",".join(validVersions)))

        # Check project if requested
        validProject = self.opsHelper.getValue("Pilot/Project", "")
        if validProject:
            if 'ReleaseProject' not in resourceDict:
                raise RuntimeError(
                    "Version check requested but expected project %s not received" % validProject)
            if resourceDict['ReleaseProject'] != validProject:
                # Adjacent literals instead of a backslash continuation inside the
                # string, which leaked stray characters into the message.
                raise RuntimeError(
                    "Version check requested "
                    "but expected project %s != received %s" % (validProject,
                                                                resourceDict['ReleaseProject']))
class SiteInspectorAgent( AgentModule ):
  """ SiteInspectorAgent

  The SiteInspectorAgent is an agent that gets all the site names and
  triggers the PEP to evaluate their status.
  """

  # Max number of worker threads by default
  __maxNumberOfThreads = 15

  # Inspection freqs, defaults, the lower, the higher priority to be checked.
  # Error state usually means there is a glitch somewhere, so it has the highest
  # priority.
  __checkingFreqs = { 'Active'   : 20,
                      'Degraded' : 20,
                      'Probing'  : 20,
                      'Banned'   : 15,
                      'Unknown'  : 10,
                      'Error'    : 5 }

  def __init__( self, *args, **kwargs ):
    """ Forward all arguments to AgentModule and declare the members filled
    in later by initialize() / execute().
    """
    AgentModule.__init__( self, *args, **kwargs )

    # Queue of site descriptions, rebuilt on every execute() cycle
    self.sitesToBeChecked = None
    self.threadPool = None
    self.siteClient = None
    # Clients handed over to every PEP instance
    self.clients = {}

  def initialize( self ):
    """ Standard initialize: creates the thread pool and the clients.

    :return: S_OK()
    """
    maxNumberOfThreads = self.am_getOption( 'maxNumberOfThreads', self.__maxNumberOfThreads )
    self.threadPool = ThreadPool( maxNumberOfThreads, maxNumberOfThreads )

    self.siteClient = SiteStatus()

    self.clients['SiteStatus'] = self.siteClient
    self.clients['ResourceManagementClient'] = ResourceManagementClient()

    return S_OK()

  def execute( self ):
    """ execute

    This is the main method of the agent. It gets the sites from the Database,
    calculates how many threads should be started and spawns them. Each thread
    will get a site from the queue until it is empty. At the end, the method
    will join the queue such that the agent will not terminate a cycle until
    all sites have been processed.

    :return: S_OK(), or the S_ERROR returned by getSitesToBeChecked()
    """
    # Gets sites to be checked ( returns a Queue )
    sitesToBeChecked = self.getSitesToBeChecked()
    if not sitesToBeChecked['OK']:
      self.log.error( sitesToBeChecked['Message'] )
      return sitesToBeChecked
    self.sitesToBeChecked = sitesToBeChecked['Value']

    queueSize = self.sitesToBeChecked.qsize()
    pollingTime = self.am_getPollingTime()

    # Assigns number of threads on the fly such that we exhaust the PollingTime
    # without having to spawn too many threads. We assume 10 seconds per element
    # to be processed ( actually, it takes something like 1 sec per element ):
    # numberOfThreads = elements * 10(s/element) / pollingTime
    numberOfThreads = int( math.ceil( queueSize * 10. / pollingTime ) )

    self.log.info( 'Needed %d threads to process %d elements' % ( numberOfThreads, queueSize ) )

    for _x in xrange( numberOfThreads ):
      jobUp = self.threadPool.generateJobAndQueueIt( self._execute )
      if not jobUp['OK']:
        self.log.error( jobUp['Message'] )

    self.log.info( 'blocking until all sites have been processed' )
    # block until all tasks are done
    self.sitesToBeChecked.join()
    self.log.info( 'done')

    return S_OK()

  def getSitesToBeChecked( self ):
    """ getSitesToBeChecked

    Gets all the site names from the SiteStatus client, then their current
    statuses, and puts one description dictionary per site into a queue.

    :return: S_OK( Queue.Queue ) || S_ERROR()
    """
    toBeChecked = Queue.Queue()

    res = self.siteClient.getSites('All')
    if not res['OK']:
      return res

    # get the current status
    res = self.siteClient.getSiteStatuses( res['Value'] )
    if not res['OK']:
      return res

    # one queue entry per site that has a status
    for site, status in res['Value'].items():
      toBeChecked.put( { 'status'      : status,
                         'name'        : site,
                         'site'        : site,
                         'element'     : 'Site',
                         'statusType'  : 'all',
                         'elementType' : 'Site' } )

    return S_OK( toBeChecked )

  # Private methods ............................................................

  def _execute( self ):
    """ Method run by each of the threads in the ThreadPool. It loops until
    there are no sites left on the queue. On each iteration, it evaluates the
    policies for one site and enforces the necessary actions.

    Every site taken from the queue is acknowledged with task_done(), even if
    the enforcement raises: otherwise execute() would block forever in join().

    :return: S_OK() once the queue is empty
    """
    pep = PEP( clients = self.clients )

    while True:

      try:
        site = self.sitesToBeChecked.get_nowait()
      except Queue.Empty:
        return S_OK()

      try:
        resEnforce = pep.enforce( site )
        if not resEnforce['OK']:
          self.log.error( 'Failed policy enforcement', resEnforce['Message'] )
      except Exception:
        # Keep this worker alive so the remaining sites still get processed
        self.log.exception( 'Exception during policy enforcement for site %s' % site['name'] )
      finally:
        # Used together with join !
        self.sitesToBeChecked.task_done()
class ResourceStatus( ElementStatus ):
  """
  ResourceStatus helper that connects to CS if RSS flag is not Active. It keeps
  the connection to the db / server as an object member, to avoid creating a new
  one massively.
  """

  __metaclass__ = DIRACSingleton

  def __init__( self ):
    """
    Constructor, initializes the logger, rssClient and caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """

    super( ResourceStatus, self ).__init__()

    self.siteStatus = SiteStatus()

    # We can set CacheLifetime and CacheHistory from CS, so that we can tune them.
    cacheLifeTime = int( RssConfiguration().getConfigCache() )

    # RSSCaches, one per elementType ( StorageElement, ComputingElement )
    # Should be generated on the fly, instead of being hardcoded ?
    self.seCache = RSSCache( 'Storage', cacheLifeTime, self._updateSECache )
    self.ceCache = RSSCache( 'Computing', cacheLifeTime, self._updateCECache )

  #.............................................................................
  # ComputingElement methods

  def getComputingStatuses( self, ceNames, statusTypes = None ):
    """
    Method that queries the RSSCache for ComputingElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).

    If match is positive, the output looks like:
    {
     computingElementA : { statusType1 : status1, statusType2 : status2 },
     computingElementB : { statusType1 : status1, statusType2 : status2 },
    }

    There are ALWAYS the same keys inside the site dictionaries. Every status
    of an element whose site is not usable for 'ComputingAccess' ( or whose
    site cannot be resolved ) is reported as 'Banned'.

    examples:
      >>> resourceStatus.getComputingStatuses( 'ce207.cern.ch', None )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getComputingStatuses( 'ce207.cern.ch', 'all' )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( [ 'ce206.cern.ch', 'ce207.cern.ch' ], 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( None, 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' },
                  ... } )

    :Parameters:
      **ceNames** - [ None, `string`, `list` ]
        name(s) of the computing elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.ceCache.match( ceNames, statusTypes )
    if not cacheMatch[ 'OK' ]:
      return cacheMatch

    return S_OK( self.__banElementsOfUnusableSites( cacheMatch[ 'Value' ], 'ComputingAccess' ) )

  def getComputingStatus( self, ceName, statusType ):
    """
    Given a ce and a statusType, it returns its status from the cache.

    examples:
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', 'all' )
          S_OK( 'Active' )
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', None )
          S_ERROR( ... )

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus( 'Computing', ceName, statusType )

  def isUsableComputing( self, ceName, statusType ):
    """
    Similar method to getComputingStatus. The difference is the output.
    Given a ce name, returns a bool if the ce is usable:
    status is Active or Degraded outputs True
    anything else outputs False

    examples:
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'all' )
          True
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'all' )
          False # May be banned
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', None )
          False
      >>> resourceStatus.isUsableComputing( 'RubbishCE', 'all' )
          False
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'RubbishAccess' )
          False

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: the value of the inherited isUsableElement; a bool according to
             the examples above - TODO confirm against ElementStatus
    """

    return self.isUsableElement( 'Computing', ceName, statusType )

  def getUsableComputings( self, statusType ):
    """
    For a given statusType, returns all computing elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples:
      >>> resourceStatus.getUsableComputings( 'all' )
          S_OK( [ 'ce206.cern.ch', 'ce207.cern.ch',... ] )
      >>> resourceStatus.getUsableComputings( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableComputings( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements( 'Computing', statusType )

  #.............................................................................
  # StorageElement methods

  def getStorageStatuses( self, seNames, statusTypes = None ):
    """
    Method that queries the RSSCache for StorageElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).

    If match is positive, the output looks like:
    {
     storageElementA : { statusType1 : status1, statusType2 : status2 },
     storageElementB : { statusType1 : status1, statusType2 : status2 },
    }

    There are ALWAYS the same keys inside the site dictionaries. Every status
    of an element whose site is not usable for 'StorageAccess' ( or whose
    site cannot be resolved ) is reported as 'Banned'.

    examples:
      >>> resourceStatus.getStorageStatuses( 'CERN-USER', None )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active', 'WriteAccess' : 'Degraded',... } } )
      >>> resourceStatus.getStorageStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getStorageStatuses( 'CERN-USER', 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageStatuses( [ 'CERN-USER', 'PIC-USER' ], 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER'  : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageStatuses( None, 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER'  : { 'ReadAccess' : 'Active' },
                  ... } )

    :Parameters:
      **seNames** - [ None, `string`, `list` ]
        name(s) of the storage elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.seCache.match( seNames, statusTypes )
    if not cacheMatch[ 'OK' ]:
      return cacheMatch

    return S_OK( self.__banElementsOfUnusableSites( cacheMatch[ 'Value' ], 'StorageAccess' ) )

  def getStorageStatus( self, seName, statusType ):
    """
    Given a se and a statusType, it returns its status from the cache.

    examples:
      >>> resourceStatus.getStorageStatus( 'CERN-USER', 'ReadAccess' )
          S_OK( 'Active' )
      >>> resourceStatus.getStorageStatus( 'CERN-USER', None )
          S_ERROR( ... )

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus( 'Storage', seName, statusType )

  def isUsableStorage( self, seName, statusType ):
    """
    Similar method to getStorageStatus. The difference is the output.
    Given a se name, returns a bool if the se is usable:
    status is Active or Degraded outputs True
    anything else outputs False

    examples:
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'ReadAccess' )
          True
      >>> resourceStatus.isUsableStorage( 'CERN-ARCHIVE', 'ReadAccess' )
          False # May be banned
      >>> resourceStatus.isUsableStorage( 'CERN-USER', None )
          False
      >>> resourceStatus.isUsableStorage( 'RubbishCE', 'ReadAccess' )
          False
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'RubbishAccess' )
          False

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: the value of the inherited isUsableElement; a bool according to
             the examples above - TODO confirm against ElementStatus
    """

    return self.isUsableElement( 'Storage', seName, statusType )

  def getUsableStorages( self, statusType ):
    """
    For a given statusType, returns all storage elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples:
      >>> resourceStatus.getUsableStorages( 'ReadAccess' )
          S_OK( [ 'CERN-USER', 'PIC-USER',... ] )
      >>> resourceStatus.getUsableStorages( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableStorages( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements( 'Storage', statusType )

  #.............................................................................
  # update Cache methods

  def _updateCECache( self ):
    """
    Method used to update the ComputingElementCache.
    """
    return self.__updateCache( 'Computing' )

  def _updateSECache( self ):
    """
    Method used to update the StorageElementCache.
    """
    return self.__updateCache( 'Storage' )

  #.............................................................................
  # Private methods

  def __updateCache( self, elementType ):
    """
    Selects Name, StatusType and Status of all the Resource statuses of the
    given elementType and converts them with getCacheDictFromRawData.

    :Parameters:
      **elementType** - `string`
        element type to select ( 'Computing', 'Storage' )

    :return: S_OK( cache dictionary ) || S_ERROR()
    """

    meta = { 'columns' : [ 'Name', 'StatusType', 'Status' ] }
    rawCache = self.rssClient.selectStatusElement( 'Resource', 'Status',
                                                   elementType = elementType,
                                                   meta = meta )

    if not rawCache[ 'OK' ]:
      return rawCache
    return S_OK( self.getCacheDictFromRawData( rawCache[ 'Value' ] ) )

  def __banElementsOfUnusableSites( self, cacheMatch, siteAccess ):
    """
    Overwrites with 'Banned' every status of the elements for which
    __getSiteAccess does not return OK. The dictionary is modified in place.

    :Parameters:
      **cacheMatch** - `dict`
        { elementName : { statusType : status } }
      **siteAccess** - `string`
        site access to be checked ( StorageAccess, ComputingAccess .. )

    :return: the same cacheMatch dictionary
    """

    # list() takes a snapshot, entries are replaced while looping
    for elementName, statusDict in list( cacheMatch.items() ):
      if not self.__getSiteAccess( elementName, siteAccess )[ 'OK' ]:
        cacheMatch[ elementName ] = dict.fromkeys( statusDict, 'Banned' )

    return cacheMatch

  def __getSiteAccess( self, elementName, siteAccess ):
    """
    Method that given an elementName, finds the site name that owns it. Once
    that is done, the site access <siteAccess> is checked and returned.

    :Parameters:
      **elementName** - `string`
        name of the resource
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )

    :return: S_OK() || S_ERROR()
    """

    siteName = Resources.getSiteForResource( elementName )
    if not siteName[ 'OK' ]:
      return siteName
    siteName = siteName[ 'Value' ]

    if not self.siteStatus.isUsableSite( siteName, siteAccess ):
      # Report the access actually checked, not always "Computing"
      return S_ERROR( 'Site %s is not usable for %s' % ( siteName, siteAccess ) )

    return S_OK()

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
class SiteDirector( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ def initialize( self ): """ Standard constructor """ self.am_setOption( "PollingTime", 60.0 ) self.am_setOption( "maxPilotWaitingHours", 6 ) self.queueDict = {} self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT self.siteStatus = SiteStatus() return S_OK() def beginExecution( self ): self.gridEnv = self.am_getOption( "GridEnv", getGridEnv() ) # The SiteDirector is for a particular user community self.vo = self.am_getOption( "Community", '' ) if not self.vo: self.vo = CSGlobals.getVO() # The SiteDirector is for a particular user group self.group = self.am_getOption( "Group", '' ) # self.voGroups contain all the eligible user groups for pilots submutted by this SiteDirector self.voGroups = [] # Choose the group for which pilots will be submitted. This is a hack until # we will be able to match pilots to VOs. 
if not self.group: if self.vo: result = Registry.getGroupsForVO( self.vo ) if not result['OK']: return result for group in result['Value']: if 'NormalUser' in Registry.getPropertiesForGroup( group ): self.voGroups.append( group ) else: self.voGroups = [ self.group ] result = findGenericPilotCredentials( vo = self.vo ) if not result[ 'OK' ]: return result self.pilotDN, self.pilotGroup = result[ 'Value' ] self.pilotDN = self.am_getOption( "PilotDN", self.pilotDN ) self.pilotGroup = self.am_getOption( "PilotGroup", self.pilotGroup ) self.platforms = [] self.sites = [] self.defaultSubmitPools = '' if self.group: self.defaultSubmitPools = Registry.getGroupOption( self.group, 'SubmitPools', '' ) elif self.vo: self.defaultSubmitPools = Registry.getVOOption( self.vo, 'SubmitPools', '' ) self.pilot = self.am_getOption( 'PilotScript', DIRAC_PILOT ) self.install = DIRAC_INSTALL self.workingDirectory = self.am_getOption( 'WorkDirectory' ) self.maxQueueLength = self.am_getOption( 'MaxQueueLength', 86400 * 3 ) self.pilotLogLevel = self.am_getOption( 'PilotLogLevel', 'INFO' ) self.maxJobsInFillMode = self.am_getOption( 'MaxJobsInFillMode', self.maxJobsInFillMode ) self.maxPilotsToSubmit = self.am_getOption( 'MaxPilotsToSubmit', self.maxPilotsToSubmit ) self.pilotWaitingFlag = self.am_getOption( 'PilotWaitingFlag', True ) self.pilotWaitingTime = self.am_getOption( 'MaxPilotWaitingTime', 7200 ) # Flags self.updateStatus = self.am_getOption( 'UpdatePilotStatus', True ) self.getOutput = self.am_getOption( 'GetPilotOutput', True ) self.sendAccounting = self.am_getOption( 'SendPilotAccounting', True ) # Get the site description dictionary siteNames = None if not self.am_getOption( 'Site', 'Any' ).lower() == "any": siteNames = self.am_getOption( 'Site', [] ) ceTypes = None if not self.am_getOption( 'CETypes', 'Any' ).lower() == "any": ceTypes = self.am_getOption( 'CETypes', [] ) ces = None if not self.am_getOption( 'CEs', 'Any' ).lower() == "any": ces = self.am_getOption( 'CEs', [] ) 
self._resources = Resources.Resources( vo = self.vo ) result = self._resources.getEligibleQueuesInfo( siteList = siteNames, ceList = ces, ceTypeList = ceTypes, mode = 'Direct' ) if not result['OK']: return result resourceDict = result['Value'] result = self.getQueues( resourceDict ) if not result['OK']: return result #if not siteNames: # siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' ) # if siteName == 'Unknown': # return S_OK( 'No site specified for the SiteDirector' ) # else: # siteNames = [siteName] #self.siteNames = siteNames if self.updateStatus: self.log.always( 'Pilot status update requested' ) if self.getOutput: self.log.always( 'Pilot output retrieval requested' ) if self.sendAccounting: self.log.always( 'Pilot accounting sending requested' ) self.log.always( 'Sites:', siteNames ) self.log.always( 'CETypes:', ceTypes ) self.log.always( 'CEs:', ces ) self.log.always( 'PilotDN:', self.pilotDN ) self.log.always( 'PilotGroup:', self.pilotGroup ) self.log.always( 'MaxPilotsToSubmit:', self.maxPilotsToSubmit ) self.log.always( 'MaxJobsInFillMode:', self.maxJobsInFillMode ) self.localhost = socket.getfqdn() self.proxy = '' if self.queueDict: self.log.always( "Agent will serve queues:" ) for queue in self.queueDict: self.log.always( "Site: %s, CE: %s, Queue: %s" % ( self.queueDict[queue]['Site'], self.queueDict[queue]['CEName'], queue ) ) return S_OK() def getQueues( self, resourceDict ): """ Get the list of relevant CEs and their descriptions """ self.queueDict = {} ceFactory = ComputingElementFactory() for site in resourceDict: result = self._resources.getSiteFullName( site ) if not result['OK']: continue siteFullName = result['Value'] for ce in resourceDict[site]: ceDict = resourceDict[site][ce] qDict = ceDict.pop( 'Queues' ) for queue in qDict: queueName = '%s_%s' % ( ce, queue ) self.queueDict[queueName] = {} self.queueDict[queueName]['ParametersDict'] = qDict[queue] self.queueDict[queueName]['ParametersDict']['Queue'] = queue 
self.queueDict[queueName]['ParametersDict']['Site'] = siteFullName self.queueDict[queueName]['ParametersDict']['GridEnv'] = self.gridEnv self.queueDict[queueName]['ParametersDict']['Setup'] = gConfig.getValue( '/DIRAC/Setup', 'unknown' ) # Evaluate the CPU limit of the queue according to the Glue convention # To Do: should be a utility if "maxCPUTime" in self.queueDict[queueName]['ParametersDict'] and \ "SI00" in self.queueDict[queueName]['ParametersDict']: maxCPUTime = float( self.queueDict[queueName]['ParametersDict']['maxCPUTime'] ) # For some sites there are crazy values in the CS maxCPUTime = max( maxCPUTime, 0 ) maxCPUTime = min( maxCPUTime, 86400 * 12.5 ) si00 = float( self.queueDict[queueName]['ParametersDict']['SI00'] ) queueCPUTime = 60. / 250. * maxCPUTime * si00 self.queueDict[queueName]['ParametersDict']['CPUTime'] = int( queueCPUTime ) qwDir = os.path.join( self.workingDirectory, queue ) if not os.path.exists( qwDir ): os.makedirs( qwDir ) self.queueDict[queueName]['ParametersDict']['WorkingDirectory'] = qwDir platform = '' if "Platform" in self.queueDict[queueName]['ParametersDict']: platform = self.queueDict[queueName]['ParametersDict']['Platform'] elif "Platform" in ceDict: platform = ceDict['Platform'] elif "OS" in ceDict: architecture = ceDict.get( 'architecture', 'x86_64' ) OS = ceDict['OS'] platform = '_'.join( [architecture, OS] ) if platform and not platform in self.platforms: self.platforms.append( platform ) if not "Platform" in self.queueDict[queueName]['ParametersDict'] and platform: result = Resources.getDIRACPlatform( platform ) if result['OK']: self.queueDict[queueName]['ParametersDict']['Platform'] = result['Value'] ceQueueDict = dict( ceDict ) ceQueueDict.update( self.queueDict[queueName]['ParametersDict'] ) result = ceFactory.getCE( ceName = ce, ceType = ceDict['CEType'], ceParametersDict = ceQueueDict ) if not result['OK']: return result self.queueDict[queueName]['CE'] = result['Value'] self.queueDict[queueName]['CEName'] = ce 
self.queueDict[queueName]['CEType'] = ceDict['CEType'] self.queueDict[queueName]['Site'] = siteFullName self.queueDict[queueName]['QueueName'] = queue self.queueDict[queueName]['Platform'] = platform result = self.queueDict[queueName]['CE'].isValid() if not result['OK']: self.log.fatal( result['Message'] ) return result if 'BundleProxy' in self.queueDict[queueName]['ParametersDict']: self.queueDict[queueName]['BundleProxy'] = True elif 'BundleProxy' in ceDict: self.queueDict[queueName]['BundleProxy'] = True if siteFullName not in self.sites: self.sites.append( siteFullName ) return S_OK() def execute( self ): """ Main execution method """ if not self.queueDict: self.log.warn( 'No site defined, exiting the cycle' ) return S_OK() result = self.submitJobs() if not result['OK']: self.log.error( 'Errors in the job submission: ', result['Message'] ) if self.updateStatus: result = self.updatePilotStatus() if not result['OK']: self.log.error( 'Errors in updating pilot status: ', result['Message'] ) return S_OK() def submitJobs( self ): """ Go through defined computing elements and submit jobs if necessary """ # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup':setup, 'CPUTime': 9999999, 'SubmitPool' : self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups result = Resources.getCompatiblePlatforms( self.platforms ) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites self.log.verbose( 'Checking overall TQ availability with requirements' ) self.log.verbose( tqDict ) rpcMatcher = RPCClient( "WorkloadManagement/Matcher" ) result = rpcMatcher.getMatchingTaskQueues( tqDict ) if not result[ 'OK' ]: return result if not result['Value']: self.log.verbose( 'No Waiting jobs suitable for the director' ) return S_OK() queues = self.queueDict.keys() random.shuffle( queues ) for queue in queues: ce = self.queueDict[queue]['CE'] ceName = 
self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' ) platform = self.queueDict[queue]['Platform'] if 'CPUTime' in self.queueDict[queue]['ParametersDict'] : queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] ) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue ) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) ) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime ) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy( self.proxy, cpuTime - 60 ) # Get the number of available slots on the target site/queue result = ce.available() if not result['OK']: self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) ) continue ceInfoDict = result['CEInfoDict'] self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \ ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) ) totalSlots = result['Value'] ceDict = ce.getParameterDict() ceDict[ 'GridCE' ] = ceName if not siteMask and 'Site' in ceDict: self.log.info( 'Site not in the mask %s' % siteName ) self.log.info( 'Removing "Site" from matching Dict' ) del ceDict[ 'Site' ] if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! 
ceDict['SubmitPool'] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms( platform ) if not result['OK']: continue ceDict['Platform'] = result['Value'] # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues( ceDict ) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] ) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.info( 'No matching TQs found' ) continue totalTQJobs = 0 tqIDList = taskQueueDict.keys() for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]['Jobs'] pilotsToSubmit = min( totalSlots, totalTQJobs ) # Get the number of already waiting pilots for this queue totalWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime() - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime ) if not result['OK']: self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] ) totalWaitingPilots = 0 else: totalWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots ) pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) ) self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \ ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit ) while pilotsToSubmit > 0: self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) ) bundleProxy = self.queueDict[queue].get( 'BundleProxy', False ) jobExecDir = '' if ceType == 'CREAM': jobExecDir = '.' 
jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir ) httpProxy = self.queueDict[queue].get( 'HttpProxy', '' ) result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir ) if not result['OK']: return result executable, pilotSubmissionChunk = result['Value'] result = ce.submitJob( executable, '', pilotSubmissionChunk ) os.unlink( executable ) if not result['OK']: self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] ) pilotsToSubmit = 0 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) ) stampDict = {} if result.has_key( 'PilotStampDict' ): stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in taskQueueDict: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append( ( tq, sumPriority ) ) rndm = random.random()*sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random()*sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key( tqID ): tqDict[tqID] = [] tqDict[tqID].append( pilotID ) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict ) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] ) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName ) if not result['OK']: self.log.error( 'Failed to set pilot status: ', result['Message'] ) continue return S_OK() ##################################################################################### def __getExecutable( self, queue, pilotsToSubmit, bundleProxy = True, 
httpProxy = '', jobExecDir = '' ): """ Prepare the full executable for queue """ proxy = None if bundleProxy: proxy = self.proxy pilotOptions, pilotsToSubmit = self.__getPilotOptions( queue, pilotsToSubmit ) if pilotOptions is None: return S_ERROR( 'Errors in compiling pilot options' ) executable = self.__writePilotScript( self.workingDirectory, pilotOptions, proxy, httpProxy, jobExecDir ) return S_OK( [ executable, pilotsToSubmit ] ) ##################################################################################### def __getPilotOptions( self, queue, pilotsToSubmit ): """ Prepare pilot options """ queueDict = self.queueDict[queue]['ParametersDict'] pilotOptions = [] setup = gConfig.getValue( "/DIRAC/Setup", "unknown" ) if setup == 'unknown': self.log.error( 'Setup is not defined in the configuration' ) return [ None, None ] pilotOptions.append( '-S %s' % setup ) opsHelper = Operations.Operations( group = self.pilotGroup, setup = setup ) #Installation defined? installationName = opsHelper.getValue( "Pilot/Installation", "" ) if installationName: pilotOptions.append( '-V %s' % installationName ) #Project defined? projectName = opsHelper.getValue( "Pilot/Project", "" ) if projectName: pilotOptions.append( '-l %s' % projectName ) else: self.log.info( 'DIRAC project will be installed by pilots' ) #Request a release diracVersion = opsHelper.getValue( "Pilot/Version", [] ) if not diracVersion: self.log.error( 'Pilot/Version is not defined in the configuration' ) return [ None, None ] #diracVersion is a list of accepted releases. 
Just take the first one pilotOptions.append( '-r %s' % diracVersion[0] ) ownerDN = self.pilotDN ownerGroup = self.pilotGroup # Request token for maximum pilot efficiency result = gProxyManager.requestToken( ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode ) if not result[ 'OK' ]: self.log.error( 'Invalid proxy token request', result['Message'] ) return [ None, None ] ( token, numberOfUses ) = result[ 'Value' ] pilotOptions.append( '-o /Security/ProxyToken=%s' % token ) # Use Filling mode pilotOptions.append( '-M %s' % min( numberOfUses, self.maxJobsInFillMode ) ) # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode ) # with numberOfUses tokens we can submit at most: # numberOfUses / min( numberOfUses, self.maxJobsInFillMode ) # pilots newPilotsToSubmit = numberOfUses / min( numberOfUses, self.maxJobsInFillMode ) if newPilotsToSubmit != pilotsToSubmit: self.log.info( 'Number of pilots to submit is changed to %d after getting the proxy token' % newPilotsToSubmit ) pilotsToSubmit = newPilotsToSubmit # Debug if self.pilotLogLevel.lower() == 'debug': pilotOptions.append( '-d' ) # CS Servers csServers = gConfig.getValue( "/DIRAC/Configuration/Servers", [] ) pilotOptions.append( '-C %s' % ",".join( csServers ) ) # DIRAC Extensions to be used in pilots pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", [] ) extensionsList = [] if pilotExtensionsList: if pilotExtensionsList[0] != 'None': extensionsList = pilotExtensionsList else: extensionsList = CSGlobals.getCSExtensions() if extensionsList: pilotOptions.append( '-e %s' % ",".join( extensionsList ) ) # Requested CPU time pilotOptions.append( '-T %s' % queueDict['CPUTime'] ) # CEName pilotOptions.append( '-N %s' % self.queueDict[queue]['CEName'] ) # SiteName pilotOptions.append( '-n %s' % queueDict['Site'] ) if 'ClientPlatform' in queueDict: pilotOptions.append( "-p '%s'" % queueDict['ClientPlatform'] ) if 'SharedArea' in queueDict: pilotOptions.append( "-o 
'/LocalSite/SharedArea=%s'" % queueDict['SharedArea'] ) if 'SI00' in queueDict: factor = float( queueDict['SI00'] ) / 250. pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % factor ) pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % factor ) else: if 'CPUScalingFactor' in queueDict: pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor'] ) if 'CPUNormalizationFactor' in queueDict: pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict['CPUNormalizationFactor'] ) # Hack if self.defaultSubmitPools: pilotOptions.append( '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools ) if self.group: pilotOptions.append( '-G %s' % self.group ) self.log.verbose( "pilotOptions: ", ' '.join( pilotOptions ) ) return [ pilotOptions, pilotsToSubmit ] ##################################################################################### def __writePilotScript( self, workingDirectory, pilotOptions, proxy = None, httpProxy = '', pilotExecDir = '' ): """ Bundle together and write out the pilot executable script, admixt the proxy if given """ try: compressedAndEncodedProxy = '' proxyFlag = 'False' if proxy is not None: compressedAndEncodedProxy = base64.encodestring( bz2.compress( proxy.dumpAllToString()['Value'] ) ) proxyFlag = 'True' compressedAndEncodedPilot = base64.encodestring( bz2.compress( open( self.pilot, "rb" ).read(), 9 ) ) compressedAndEncodedInstall = base64.encodestring( bz2.compress( open( self.install, "rb" ).read(), 9 ) ) except: self.log.exception( 'Exception during file compression of proxy, dirac-pilot or dirac-install' ) return S_ERROR( 'Exception during file compression of proxy, dirac-pilot or dirac-install' ) localPilot = """#!/bin/bash /usr/bin/env python << EOF # import os, tempfile, sys, shutil, base64, bz2 try: pilotExecDir = '%(pilotExecDir)s' if not pilotExecDir: pilotExecDir = None pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 
'DIRAC_', dir = pilotExecDir ) pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory ) os.chdir( pilotWorkingDirectory ) if %(proxyFlag)s: open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) ) os.chmod("proxy",0600) os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy') open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) ) open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) ) os.chmod("%(pilotScript)s",0700) os.chmod("%(installScript)s",0700) if "LD_LIBRARY_PATH" not in os.environ: os.environ["LD_LIBRARY_PATH"]="" if "%(httpProxy)s": os.environ["HTTP_PROXY"]="%(httpProxy)s" os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates') # TODO: structure the output print '===========================================================' print 'Environment of execution host' for key in os.environ.keys(): print key + '=' + os.environ[key] print '===========================================================' except Exception, x: print >> sys.stderr, x sys.exit(-1) cmd = "python %(pilotScript)s %(pilotOptions)s" print 'Executing: ', cmd sys.stdout.flush() os.system( cmd ) shutil.rmtree( pilotWorkingDirectory ) EOF """ % { 'compressedAndEncodedProxy': compressedAndEncodedProxy, 'compressedAndEncodedPilot': compressedAndEncodedPilot, 'compressedAndEncodedInstall': compressedAndEncodedInstall, 'httpProxy': httpProxy, 'pilotExecDir': pilotExecDir, 'pilotScript': os.path.basename( self.pilot ), 'installScript': os.path.basename( self.install ), 'pilotOptions': ' '.join( pilotOptions ), 'proxyFlag': proxyFlag } fd, name = tempfile.mkstemp( suffix = '_pilotwrapper.py', prefix = 'DIRAC_', dir = workingDirectory ) pilotWrapper = os.fdopen( fd, 'w' ) pilotWrapper.write( localPilot ) pilotWrapper.close() return name def 
updatePilotStatus( self ): """ Update status of pilots in transient states """ for queue in self.queueDict: ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] queueName = self.queueDict[queue]['QueueName'] ceType = self.queueDict[queue]['CEType'] siteName = self.queueDict[queue]['Site'] result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName, 'Queue':queueName, 'GridType':ceType, 'GridSite':siteName, 'Status':TRANSIENT_PILOT_STATUS, 'OwnerDN': self.pilotDN, 'OwnerGroup': self.pilotGroup } ) if not result['OK']: self.log.error( 'Failed to select pilots: %s' % result['Message'] ) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo( pilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots info from DB', result['Message'] ) continue pilotDict = result['Value'] stampedPilotRefs = [] for pRef in pilotDict: if pilotDict[pRef]['PilotStamp']: stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] ) else: stampedPilotRefs = list( pilotRefs ) break result = ce.isProxyValid() if not result['OK']: result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 600 ) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy( self.proxy, 500 ) result = ce.getJobStatus( stampedPilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) ) continue pilotCEDict = result['Value'] for pRef in pilotRefs: newStatus = '' oldStatus = pilotDict[pRef]['Status'] ceStatus = pilotCEDict[pRef] if oldStatus == ceStatus: # Status did not change, continue continue elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS: # Pilot finished without reporting, consider it Aborted newStatus = 'Aborted' elif ceStatus != 'Unknown' : # Update the pilot status to the new value newStatus = ceStatus if newStatus: self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) ) 
result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' ) # Retrieve the pilot output now if newStatus in FINAL_PILOT_STATUS: if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput: self.log.info( 'Retrieving output for pilot %s' % pRef ) pilotStamp = pilotDict[pRef]['PilotStamp'] pRefStamp = pRef if pilotStamp: pRefStamp = pRef + ':::' + pilotStamp result = ce.getJobOutput( pRefStamp ) if not result['OK']: self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) ) else: output, error = result['Value'] if output: result = pilotAgentsDB.storePilotOutput( pRef, output, error ) if not result['OK']: self.log.error( 'Failed to store pilot output', result['Message'] ) else: self.log.warn( 'Empty pilot output not stored to PilotDB' ) # The pilot can be in Done state set by the job agent check if the output is retrieved for queue in self.queueDict: ce = self.queueDict[queue]['CE'] if not ce.isProxyValid( 120 ): result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 ) if not result['OK']: return result ce.setProxy( self.proxy, 940 ) ceName = self.queueDict[queue]['CEName'] queueName = self.queueDict[queue]['QueueName'] ceType = self.queueDict[queue]['CEType'] siteName = self.queueDict[queue]['Site'] result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName, 'Queue':queueName, 'GridType':ceType, 'GridSite':siteName, 'OutputReady':'False', 'Status':FINAL_PILOT_STATUS} ) if not result['OK']: self.log.error( 'Failed to select pilots', result['Message'] ) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo( pilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots info from DB', result['Message'] ) continue pilotDict = result['Value'] if self.getOutput: for pRef in pilotRefs: self.log.info( 'Retrieving output for pilot %s' % pRef ) pilotStamp = pilotDict[pRef]['PilotStamp'] pRefStamp = pRef if pilotStamp: 
pRefStamp = pRef + ':::' + pilotStamp result = ce.getJobOutput( pRefStamp ) if not result['OK']: self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) ) else: output, error = result['Value'] result = pilotAgentsDB.storePilotOutput( pRef, output, error ) if not result['OK']: self.log.error( 'Failed to store pilot output', result['Message'] ) # Check if the accounting is to be sent if self.sendAccounting: result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName, 'Queue':queueName, 'GridType':ceType, 'GridSite':siteName, 'AccountingSent':'False', 'Status':FINAL_PILOT_STATUS} ) if not result['OK']: self.log.error( 'Failed to select pilots', result['Message'] ) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo( pilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots info from DB', result['Message'] ) continue pilotDict = result['Value'] result = self.sendPilotAccounting( pilotDict ) if not result['OK']: self.log.error( 'Failed to send pilot agent accounting' ) return S_OK() def sendPilotAccounting( self, pilotDict ): """ Send pilot accounting record """ for pRef in pilotDict: self.log.verbose( 'Preparing accounting record for pilot %s' % pRef ) pA = PilotAccounting() pA.setEndTime( pilotDict[pRef][ 'LastUpdateTime' ] ) pA.setStartTime( pilotDict[pRef][ 'SubmissionTime' ] ) retVal = CS.getUsernameForDN( pilotDict[pRef][ 'OwnerDN' ] ) if not retVal[ 'OK' ]: userName = '******' self.log.error( "Can't determine username for dn:", pilotDict[pRef][ 'OwnerDN' ] ) else: userName = retVal[ 'Value' ] pA.setValueByKey( 'User', userName ) pA.setValueByKey( 'UserGroup', pilotDict[pRef][ 'OwnerGroup' ] ) result = getSiteForCE( pilotDict[pRef][ 'DestinationSite' ] ) if result['OK'] and result[ 'Value' ].strip(): pA.setValueByKey( 'Site', result['Value'].strip() ) else: pA.setValueByKey( 'Site', 'Unknown' ) pA.setValueByKey( 'GridCE', pilotDict[pRef][ 'DestinationSite' ] ) 
pA.setValueByKey( 'GridMiddleware', pilotDict[pRef][ 'GridType' ] ) pA.setValueByKey( 'GridResourceBroker', pilotDict[pRef][ 'Broker' ] ) pA.setValueByKey( 'GridStatus', pilotDict[pRef][ 'Status' ] ) if not 'Jobs' in pilotDict[pRef]: pA.setValueByKey( 'Jobs', 0 ) else: pA.setValueByKey( 'Jobs', len( pilotDict[pRef]['Jobs'] ) ) self.log.info( "Adding accounting record for pilot %s" % pilotDict[pRef][ 'PilotID' ] ) retVal = gDataStoreClient.addRegister( pA ) if not retVal[ 'OK' ]: self.log.error( 'Failed to send accounting info for pilot ', pRef ) else: # Set up AccountingSent flag result = pilotAgentsDB.setAccountingFlag( pRef ) if not result['OK']: self.log.error( 'Failed to set accounting flag for pilot ', pRef ) self.log.info( 'Committing accounting records for %d pilots' % len( pilotDict ) ) result = gDataStoreClient.commit() if result['OK']: for pRef in pilotDict: self.log.verbose( 'Setting AccountingSent flag for pilot %s' % pRef ) result = pilotAgentsDB.setAccountingFlag( pRef ) if not result['OK']: self.log.error( 'Failed to set accounting flag for pilot ', pRef ) else: return result return S_OK()
def setUp(self):
    """Create the status clients used by the test cases.

    Stores a ResourceStatusClient as ``self.rsClient`` and a SiteStatus
    helper as ``self.stClient``; the SiteStatus instance has its
    ``rssFlag`` attribute set to True.
    """
    self.rsClient = ResourceStatusClient()

    siteStatusClient = SiteStatus()
    self.stClient = siteStatusClient
    siteStatusClient.rssFlag = True
def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
    """Get summary of the pilot jobs status by CE/site in a standard structure.

    Pilot counters are read from the ``PilotAgents`` table through
    ``self.getCounters`` four times: with the caller's selection, then for
    pilots Aborted during the last hour, pilots Aborted/Done during the last
    day and pilots Done during the last day with ``CurrentJobID`` 0
    (reported as "Done_Empty").

    :param dict selectDict: selection conditions. The keys ``LastUpdateTime``,
        ``GridSite``, ``Status`` and ``ExpandSite`` are interpreted here, the
        remaining ones are handed to ``getCounters``. A copy is used, so the
        caller's dictionary is left untouched.
    :param sortList: accepted for interface compatibility, not used.
    :param int startItem: index of the first record to return.
    :param int maxItems: maximum number of records to return, 0 returns all.
    :return: S_OK with a dictionary holding ``TotalRecords``,
        ``ParameterNames``, ``Records`` and ``Extras`` (the counters summed
        over all sites), or the failed ``getCounters`` result.
    """
    allStateNames = PilotStatus.PILOT_STATES + ["Done_Empty", "Aborted_Hour"]
    paramNames = ["Site", "CE"] + allStateNames
    countedColumns = allStateNames + ["Total"]
    # Every record is laid out as paramNames + [Total, PilotsPerJob, PilotJobEff, Status],
    # so the Status column position follows from the number of states.
    statusColumn = len(paramNames) + 3

    def asList(value):
        """Wrap a scalar selection value into a list."""
        return value if isinstance(value, list) else [value]

    def evaluate(done, empty, aborted, total, minTotal):
        """Return the [PilotsPerJob, PilotJobEff, Status] columns for a set of counters."""
        # Pilot submission efficiency evaluation
        if (done - empty) > 0:
            pilotsPerJob = done / (done - empty)
        elif done == 0:
            pilotsPerJob = 0.0
        elif empty == done:
            pilotsPerJob = 99.0
        else:
            pilotsPerJob = 0.0

        # Pilot job efficiency evaluation
        if total > 0:
            jobEff = (total - aborted) / total * 100
        else:
            jobEff = 100.0

        # Quality status, only evaluated above minTotal pilots
        if total <= minTotal:
            status = "Idle"
        elif jobEff < 25.0:
            status = "Bad"
        elif jobEff < 60.0:
            status = "Poor"
        elif jobEff < 85.0:
            status = "Fair"
        else:
            status = "Good"
        return ["%.2f" % pilotsPerJob, "%.2f" % jobEff, status]

    def countPilots(condDict, newer):
        """Count pilots per (GridSite, DestinationSite, Status) updated after ``newer``."""
        return self.getCounters(
            "PilotAgents",
            ["GridSite", "DestinationSite", "Status"],
            condDict,
            newer=newer,
            timeStamp="LastUpdateTime",
        )

    # Work on a copy: the selection is edited below and must not leak to the caller
    selectDict = dict(selectDict)
    last_update = selectDict.pop("LastUpdateTime", None)
    site_select = asList(selectDict.pop("GridSite", []))
    status_select = asList(selectDict.pop("Status", []))
    expand_site = ""
    if "ExpandSite" in selectDict:
        expand_site = selectDict.pop("ExpandSite")
        site_select = [expand_site]

    # Get all the data from the database with various selections
    result = countPilots(selectDict, last_update)
    if not result["OK"]:
        return result

    last_update = Time.dateTime() - Time.hour
    selectDict["Status"] = PilotStatus.ABORTED
    resultHour = countPilots(selectDict, last_update)
    if not resultHour["OK"]:
        return resultHour

    last_update = Time.dateTime() - Time.day
    selectDict["Status"] = [PilotStatus.ABORTED, PilotStatus.DONE]
    resultDay = countPilots(selectDict, last_update)
    if not resultDay["OK"]:
        return resultDay

    selectDict["CurrentJobID"] = 0
    selectDict["Status"] = PilotStatus.DONE
    resultDayEmpty = countPilots(selectDict, last_update)
    if not resultDayEmpty["OK"]:
        return resultDayEmpty

    ceMap = {}
    resMap = getCESiteMapping()
    if resMap["OK"]:
        ceMap = resMap["Value"]

    # Sort out different counters
    resultDict = {"Unknown": {}}

    def countersFor(attDict):
        """Return the counters of the site/CE a row belongs to, creating them zeroed if needed.

        The same site resolution is applied to every query so that rows of
        the hour/day queries always land on the entry of the main query, and
        a site/CE seen only in the hour/day queries does not raise KeyError.
        """
        site = attDict["GridSite"]
        ce = attDict["DestinationSite"]
        if site == "Unknown" and ce != "Unknown" and ce != "Multiple" and ce in ceMap:
            site = ceMap[ce]
        ceDict = resultDict.setdefault(site, {})
        if ce not in ceDict:
            ceDict[ce] = dict.fromkeys(allStateNames, 0)
        return ceDict[ce]

    for attDict, count in result["Value"]:
        countersFor(attDict)[attDict["Status"]] = count

    # Done and Aborted are reported for the last day only
    for attDict, count in resultDay["Value"]:
        state = attDict["Status"]
        if state == PilotStatus.DONE or state == PilotStatus.ABORTED:
            countersFor(attDict)[state] = count

    for attDict, count in resultDayEmpty["Value"]:
        if attDict["Status"] == PilotStatus.DONE:
            countersFor(attDict)["Done_Empty"] = count

    for attDict, count in resultHour["Value"]:
        if attDict["Status"] == PilotStatus.ABORTED:
            countersFor(attDict)["Aborted_Hour"] = count

    records = []
    siteSumDict = {}
    for site, ceDict in resultDict.items():
        sumDict = dict.fromkeys(countedColumns, 0)

        for ce, counters in ceDict.items():
            total = 0
            for state in allStateNames:
                sumDict[state] += counters[state]
                # The two derived counters are already included in Done / Aborted
                if state != "Aborted_Hour" and state != "Done_Empty":
                    total += counters[state]
            sumDict["Total"] += total

            # A per-CE record is only reported for single-CE sites or an expanded site
            if len(ceDict) == 1 or expand_site:
                itemList = [site, ce] + [counters[state] for state in allStateNames]
                # Add the total number of pilots seen in the last day
                itemList.append(total)
                itemList += evaluate(
                    counters[PilotStatus.DONE], counters["Done_Empty"], counters[PilotStatus.ABORTED], total, 10
                )
                records.append(itemList)

        # Sites with several CEs are collapsed into one "Multiple" record
        if len(ceDict) > 1 and not expand_site:
            itemList = [site, "Multiple"] + [sumDict[state] for state in countedColumns]
            itemList += evaluate(
                sumDict[PilotStatus.DONE], sumDict["Done_Empty"], sumDict[PilotStatus.ABORTED], sumDict["Total"], 10
            )
            records.append(itemList)

        for state in countedColumns:
            siteSumDict[state] = siteSumDict.get(state, 0) + sumDict[state]

    # Perform site selection
    if site_select:
        records = [r for r in records if r[0] in site_select]

    # Perform status selection
    if status_select:
        records = [r for r in records if r[statusColumn] in status_select]

    # Get the Site Mask data
    result = SiteStatus().getUsableSites()
    if result["OK"]:
        siteMask = result["Value"]
        for r in records:
            r.append("Yes" if r[0] in siteMask else "No")
    else:
        for r in records:
            r.append("Unknown")

    finalDict = {}
    finalDict["TotalRecords"] = len(records)
    finalDict["ParameterNames"] = paramNames + ["Total", "PilotsPerJob", "PilotJobEff", "Status", "InMask"]

    # Return all the records if maxItems == 0 or the specified number otherwise
    if maxItems:
        finalDict["Records"] = records[startItem:startItem + maxItems]
    else:
        finalDict["Records"] = records

    # Overall evaluation over all sites, the status needs more than 100 pilots
    pilotsPerJob, pilotJobEff, status = evaluate(
        siteSumDict[PilotStatus.DONE],
        siteSumDict["Done_Empty"],
        siteSumDict[PilotStatus.ABORTED],
        siteSumDict["Total"],
        100,
    )
    siteSumDict["PilotsPerJob"] = pilotsPerJob
    siteSumDict["PilotJobEff"] = pilotJobEff
    siteSumDict["Status"] = status

    finalDict["Extras"] = siteSumDict
    return S_OK(finalDict)
class PilotDirector( object ):
  """
    Base Pilot Director class.
    Derived classes must implement:
      * __init__( self, submitPool ):
          that must call the parent class __init__ method and then do its own initialization
      * configure( self, csSection, submitPool ):
          that must call the parent class configure method and then do its own configuration
      * _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                       submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
      * _listMatch( self, proxy, jdl, taskQueueID, rb )
      * _getChildrenReferences( self, proxy, parentReference, taskQueueID )

    Derived classes might implement:
      * configureFromSection( self, mySection ):
          to reload from a CS section the additional datamembers they might have defined.

    If additional datamembers are defined, they must:
      - be declared in the __init__
      - be reconfigured in the configureFromSection method by executing
        self.reloadConfiguration( csSection, submitPool ) in their configure method
  """
  gridMiddleware = ''

  def __init__( self, submitPool ):
    """
     Define the logger and some defaults

     :param submitPool: name of the submit pool served by this director; it is
                        used for the logger name and the SubmitPool pilot option
    """
    if submitPool == self.gridMiddleware:
      self.log = gLogger.getSubLogger( '%sPilotDirector' % self.gridMiddleware )
    else:
      self.log = gLogger.getSubLogger( '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool ) )

    self.pilot = DIRAC_PILOT
    self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installProject = DIRAC_PROJECT
    self.installation = DIRAC_INSTALLATION
    self.pilotExtensionsList = []

    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.extraModules = DIRAC_MODULES
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [ self.gridMiddleware ]

    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    self.siteClient = SiteStatus()

    if 'log' not in self.__dict__:
      self.log = gLogger.getSubLogger( 'PilotDirector' )
    self.log.info( 'Initialized' )

  def configure( self, csSection, submitPool ):
    """
     Here goes common configuration for all PilotDirectors

     :param csSection: CS section holding the director options
     :param submitPool: name of the submit pool, used as CS sub-section name
    """
    self.configureFromSection( csSection )
    self.reloadConfiguration( csSection, submitPool )

    # Get the defaults for the Setup where the Director is running
    opsHelper = Operations()
    self.installVersion = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ), [ self.installVersion ] )[0]
    self.installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ), self.installProject )
    self.installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation )
    self.pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", self.pilotExtensionsList )

    self.log.info( '===============================================' )
    self.log.info( 'Configuration:' )
    self.log.info( '' )
    self.log.info( ' Target Grids: ', ', '.join( self.targetGrids ) )
    self.log.info( ' Install script: ', self.install )
    self.log.info( ' Pilot script: ', self.pilot )
    self.log.info( ' Pilot modules', self.extraModules )
    self.log.info( ' Install Ver: ', self.installVersion )
    if self.installProject:
      self.log.info( ' Project: ', self.installProject )
    if self.installation:
      self.log.info( ' Installation: ', self.installation )
    if self.extraPilotOptions:
      self.log.info( ' Extra Options: ', ' '.join( self.extraPilotOptions ) )
    self.log.info( ' ListMatch: ', self.enableListMatch )
    self.log.info( ' Private %: ', self.privatePilotFraction * 100 )
    if self.enableListMatch:
      self.log.info( ' ListMatch Delay:', self.listMatchDelay )
    self.listMatchCache.purgeExpired()

  def reloadConfiguration( self, csSection, submitPool ):
    """
     Common Configuration can be overwritten for each GridMiddleware
     and, after that, for each SubmitPool
    """
    mySection = csSection + '/' + self.gridMiddleware
    self.configureFromSection( mySection )

    # And Again for each SubmitPool
    mySection = csSection + '/' + submitPool
    self.configureFromSection( mySection )

  def configureFromSection( self, mySection ):
    """
      reload from CS

      Every option missing in mySection keeps the current value of the
      corresponding datamember.
    """
    self.pilot = gConfig.getValue( mySection + '/PilotScript' , self.pilot )
    self.installVersion = gConfig.getValue( mySection + '/Version' , self.installVersion )
    self.extraPilotOptions = gConfig.getValue( mySection + '/ExtraPilotOptions' , self.extraPilotOptions )
    self.install = gConfig.getValue( mySection + '/InstallScript' , self.install )
    # Modules found in this section are put in front of the ones already known
    self.extraModules = gConfig.getValue( mySection + '/ExtraPilotModules' , [] ) + self.extraModules
    self.installProject = gConfig.getValue( mySection + '/Project' , self.installProject )
    self.installation = gConfig.getValue( mySection + '/Installation' , self.installation )
    self.maxJobsInFillMode = gConfig.getValue( mySection + '/MaxJobsInFillMode' , self.maxJobsInFillMode )
    self.targetGrids = gConfig.getValue( mySection + '/TargetGrids' , self.targetGrids )

    self.enableListMatch = gConfig.getValue( mySection + '/EnableListMatch' , self.enableListMatch )
    self.listMatchDelay = gConfig.getValue( mySection + '/ListMatchDelay' , self.listMatchDelay )
    self.errorClearTime = gConfig.getValue( mySection + '/ErrorClearTime' , self.errorClearTime )
    self.errorTicketTime = gConfig.getValue( mySection + '/ErrorTicketTime' , self.errorTicketTime )
    self.errorMailAddress = gConfig.getValue( mySection + '/ErrorMailAddress' , self.errorMailAddress )
    self.alarmMailAddress = gConfig.getValue( mySection + '/AlarmMailAddress' , self.alarmMailAddress )
    self.mailFromAddress = gConfig.getValue( mySection + '/MailFromAddress' , self.mailFromAddress )
    self.privatePilotFraction = gConfig.getValue( mySection + '/PrivatePilotFraction' , self.privatePilotFraction )

    # VO: CS option first, then getVOForGroup, finally the current value
    virtualOrganization = gConfig.getValue( mySection + '/VirtualOrganization' , '' )
    if not virtualOrganization:
      virtualOrganization = getVOForGroup( 'NonExistingGroup' )
      if not virtualOrganization:
        virtualOrganization = self.virtualOrganization
    self.virtualOrganization = virtualOrganization

  def _resolveCECandidates( self, taskQueueDict ):
    """
      Return a list of CEs for this TaskQueue

      CEs explicitly requested through 'GridCEs' are returned as they are.
      Otherwise the sites returned by self.siteClient.getSites() are filtered
      with the 'BannedSites' and 'Sites' of the TaskQueue and the CEs of the
      remaining sites whose SubmissionMode matches self.gridMiddleware are
      collected from the CS. An empty list is returned on any failure.
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      self.log.info( 'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join( taskQueueDict['GridCEs'] ) )
      return taskQueueDict['GridCEs']

    # Get the mask
    ret = self.siteClient.getSites()
    if not ret['OK']:
      self.log.error( 'Can not retrieve site Mask from DB:', ret['Message'] )
      return []

    siteMask = ret['Value']
    if not siteMask:
      self.log.error( 'Site mask is empty' )
      return []

    # Work on a private copy: sites are removed below and the list handed
    # out by the site client must not be altered
    siteMask = list( siteMask )

    self.log.verbose( 'Site Mask: %s' % ', '.join( siteMask ) )

    # remove banned sites from siteMask
    if 'BannedSites' in taskQueueDict:
      for site in taskQueueDict['BannedSites']:
        if site in siteMask:
          siteMask.remove( site )
          self.log.verbose( 'Removing banned site %s from site Mask' % site )

    # remove from the mask if a Site is given
    siteMask = [ site for site in siteMask if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites'] ]

    if not siteMask:
      # pilot can not be submitted
      self.log.info( 'No Valid Site Candidate in Mask for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      return []

    self.log.info( 'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) )

    # Get CE's associates to the given site Names
    ceMask = []

    for grid in self.targetGrids:

      section = '/Resources/Sites/%s' % grid
      ret = gConfig.getSections( section )
      if not ret['OK']:
        # this is hack, maintained until LCG is added as TargetGrid for the gLite SubmitPool
        section = '/Resources/Sites/LCG'
        ret = gConfig.getSections( section )

      if not ret['OK']:
        self.log.error( 'Could not obtain CEs from CS', ret['Message'] )
        continue

      gridSites = ret['Value']
      for siteName in gridSites:
        if siteName in siteMask:
          ret = gConfig.getValue( '%s/%s/CE' % ( section, siteName ), [] )
          for ce in ret:
            submissionMode = gConfig.getValue( '%s/%s/CEs/%s/SubmissionMode' % ( section, siteName, ce ), 'gLite' )
            if submissionMode == self.gridMiddleware and ce not in ceMask:
              ceMask.append( ce )

    if not ceMask:
      self.log.info( 'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) )

    self.log.verbose( 'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( ceMask ) )

    return ceMask

  def _getPilotOptions( self, taskQueueDict, pilotsToSubmit ):
    """
      Prepare the pilot command line options for the given TaskQueue

      :return: S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup,
                       submitPrivatePilot, privateTQ ) ) or S_ERROR if the generic
               pilot credentials or the proxy token can not be obtained
    """
    # Need to limit the maximum number of pilots to submit at once
    # For generic pilots this is limited by the number of use of the tokens and the
    # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation:
    pilotsToSubmit = max( min( pilotsToSubmit, int( 50 / self.maxJobsInFillMode ) ), 1 )
    pilotOptions = []
    privateIfGenericTQ = self.privatePilotFraction > random.random()
    privateTQ = ( 'PilotTypes' in taskQueueDict and 'private' in [ t.lower() for t in taskQueueDict['PilotTypes'] ] )
    forceGeneric = 'ForceGeneric' in taskQueueDict
    submitPrivatePilot = ( privateIfGenericTQ or privateTQ ) and not forceGeneric
    if submitPrivatePilot:
      self.log.verbose( 'Submitting private pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      ownerDN = taskQueueDict['OwnerDN']
      ownerGroup = taskQueueDict['OwnerGroup']
      # User Group requirement
      pilotOptions.append( '-G %s' % taskQueueDict['OwnerGroup'] )
      # check if group allows jobsharing
      ownerGroupProperties = getPropertiesForGroup( ownerGroup )
      if 'JobSharing' not in ownerGroupProperties:
        # Add Owner requirement to pilot
        pilotOptions.append( "-O '%s'" % ownerDN )
      if privateTQ:
        pilotOptions.append( '-o /Resources/Computing/CEDefaults/PilotType=private' )
      maxJobsInFillMode = self.maxJobsInFillMode
    else:
      #For generic jobs we'll submit mixture of generic and private pilots
      self.log.verbose( 'Submitting generic pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      #ADRI: Find the generic group
      result = findGenericPilotCredentials( group = taskQueueDict[ 'OwnerGroup' ] )
      if not result[ 'OK' ]:
        self.log.error( ERROR_GENERIC_CREDENTIALS, result[ 'Message' ] )
        return S_ERROR( ERROR_GENERIC_CREDENTIALS )
      ownerDN, ownerGroup = result[ 'Value' ]

      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        return S_ERROR( ERROR_TOKEN )
      ( token, numberOfUses ) = result[ 'Value' ]
      pilotsToSubmit = min( numberOfUses, pilotsToSubmit )

      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )

      # Number of pilots needed to serve pilotsToSubmit jobs in filling mode.
      # Floor division keeps the pilot count an integer whatever the
      # semantics of "/" are for the running interpreter.
      pilotsToSubmit = max( 1, ( pilotsToSubmit - 1 ) // self.maxJobsInFillMode + 1 )

      maxJobsInFillMode = int( numberOfUses / pilotsToSubmit )
    # Use Filling mode
    pilotOptions.append( '-M %s' % maxJobsInFillMode )

    # Debug
    pilotOptions.append( '-d' )
    # Setup.
    pilotOptions.append( '-S %s' % taskQueueDict['Setup'] )
    # CS Servers
    csServers = gConfig.getServersList()
    if len( csServers ) > 3:
      # Remove the master
      master = gConfigurationData.getMasterServer()
      if master in csServers:
        csServers.remove( master )
    pilotOptions.append( '-C %s' % ",".join( csServers ) )
    # DIRAC Extensions to be used in pilots
    # ubeda: I'm not entirely sure if we can use here the same opsHelper as in line
    # line +352
    pilotExtensionsList = Operations().getValue( "Pilot/Extensions", [] )
    extensionsList = []
    if pilotExtensionsList:
      if pilotExtensionsList[0] != 'None':
        extensionsList = pilotExtensionsList
    else:
      extensionsList = getCSExtensions()
    if extensionsList:
      pilotOptions.append( '-e %s' % ",".join( extensionsList ) )

    #Get DIRAC version and project, There might be global Setup defaults and per VO/Setup defaults (from configure)
    opsHelper = Operations( group = taskQueueDict['OwnerGroup'], setup = taskQueueDict['Setup'] )
    # Requested version of DIRAC (it can be a list, so we take the first one)
    version = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ) , [ self.installVersion ] )[0]
    pilotOptions.append( '-r %s' % version )
    # Requested Project to install
    installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ) , self.installProject )
    if installProject:
      pilotOptions.append( '-l %s' % installProject )
    installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation )
    if installation:
      pilotOptions.append( "-V %s" % installation )
    # Requested CPU time
    pilotOptions.append( '-T %s' % taskQueueDict['CPUTime'] )

    if self.submitPoolOption not in self.extraPilotOptions:
      pilotOptions.append( self.submitPoolOption )

    if self.extraPilotOptions:
      pilotOptions.extend( self.extraPilotOptions )

    return S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method must be implemented on the Backend specific derived class.
      This is problem with the Director, not with the Job so we must return S_OK
      Return S_ERROR if not defined.
    """
    self.log.error( '_submitPilots method not implemented' )
    return S_OK()

  def _submitPilot( self, proxy, pilotsToSubmit, jdl, taskQueueID, rb ):
    """ Submit pilot and get back the reference
    """
    self.log.error( '_submitPilot method not implemented' )
    return S_OK()

  def _listMatch( self, proxy, jdl, taskQueueID, rb ):
    """ This method must be implemented on the Backend specific derived class.
    """
    self.log.error( '_listMatch method not implemented' )
    return S_OK()

  def _getChildrenReferences( self, proxy, parentReference, taskQueueID ):
    """ This method must be implemented on the Backend specific derived class.
    """
    self.log.error( '_getChildrenReferences method not implemented' )
    return S_OK()

  def submitPilots( self, taskQueueDict, pilotsToSubmit, workDir = None ):
    """
      Submit pilot for the given TaskQueue,
      this method just insert the request in the corresponding ThreadPool,
      the submission is done from the Thread Pool job

      :return: the result of _submitPilots, S_ERROR if no CE, pilot options or
               proxy are available, S_OK( 0 ) if an exception is caught
    """
    try:

      taskQueueID = taskQueueDict['TaskQueueID']

      self.log.verbose( 'Submitting Pilot' )
      ceMask = self._resolveCECandidates( taskQueueDict )
      if not ceMask:
        return S_ERROR( 'No CE available for TaskQueue %d' % int( taskQueueID ) )
      result = self._getPilotOptions( taskQueueDict, pilotsToSubmit )
      if not result['OK']:
        return result
      ( pilotOptions, pilotsPerJob, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) = result['Value']
      # get a valid proxy, submit with a long proxy to avoid renewal
      ret = self._getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft = 86400 * 5 )
      if not ret['OK']:
        self.log.error( ret['Message'] )
        self.log.error( 'No proxy Available', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) )
        return S_ERROR( ERROR_PROXY )
      proxy = ret['Value']
      # Now call a Grid Specific method to handle the final submission of the pilots
      return self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                 pilotsToSubmit, ceMask,
                                 submitPrivatePilot, privateTQ,
                                 proxy, pilotsPerJob )

    except Exception:
      # Best effort: the problem is logged and reported as "no pilot submitted"
      self.log.exception( 'Error in Pilot Submission' )

    return S_OK( 0 )

  def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """
     To be overwritten if a given Pilot does not require a full proxy
    """
    self.log.info( "Downloading %s@%s proxy" % ( ownerDN, ownerGroup ) )
    return gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft )

  def exceptionCallBack( self, threadedJob, exceptionInfo ):
    """ Log the exception raised by a threaded submission job
    """
    self.log.exception( 'Error in Pilot Submission' )
def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
    """ Get summary of the pilot jobs status by CE/site in a standard structure

    :param dict selectDict: selection conditions forwarded to getCounters().
        The keys 'LastUpdateTime', 'GridSite', 'Status' and 'ExpandSite' are
        interpreted here and not forwarded as they are. The dictionary passed
        by the caller is not modified.
    :param sortList: not used by this method
    :param int startItem: index of the first record to return
    :param int maxItems: maximum number of records to return, 0 means all

    :return: S_OK with a dictionary holding 'TotalRecords', 'ParameterNames',
        'Records' and 'Extras' (overall sums and evaluation), or the failed
        result of getCounters()
    """
    stateNames = ['Submitted', 'Ready', 'Scheduled', 'Waiting',
                  'Running', 'Done', 'Aborted', 'Failed']
    allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
    paramNames = ['Site', 'CE'] + allStateNames
    recordNames = paramNames + ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status', 'InMask']
    # Position of the quality status in a record. It is derived from the
    # column names so that it cannot drift when the list of states changes.
    statusIndex = recordNames.index('Status')
    summedNames = allStateNames + ['Total']
    counterFields = ['GridSite', 'DestinationSite', 'Status']

    def _evaluate(done, empty, aborted, total, minPilots):
        """ Return [PilotsPerJob, PilotJobEff, Status] for a set of counters """
        # float() keeps the ratios exact also where "/" is an integer division
        if done - empty > 0:
            pilotsPerJob = float(done) / float(done - empty)
        elif done == 0:
            pilotsPerJob = 0.
        elif empty == done:
            pilotsPerJob = 99.
        else:
            pilotsPerJob = 0.

        if total > 0:
            jobEff = float(total - aborted) / float(total) * 100.
        else:
            jobEff = 100.

        # Below the minimal statistics no quality statement is made
        if total <= minPilots:
            status = 'Idle'
        elif jobEff < 25.:
            status = 'Bad'
        elif jobEff < 60.:
            status = 'Poor'
        elif jobEff < 85.:
            status = 'Fair'
        else:
            status = 'Good'
        return ['%.2f' % pilotsPerJob, '%.2f' % jobEff, status]

    # Work on a copy: the selection is rewritten several times below
    selectDict = dict(selectDict)
    last_update = selectDict.pop('LastUpdateTime', None)

    site_select = selectDict.pop('GridSite', [])
    if not isinstance(site_select, list):
        site_select = [site_select]

    status_select = selectDict.pop('Status', [])
    if not isinstance(status_select, list):
        status_select = [status_select]

    expand_site = ''
    if 'ExpandSite' in selectDict:
        # Expanding a site overrides any other site selection
        expand_site = selectDict.pop('ExpandSite')
        site_select = [expand_site]

    # Get all the data from the database with various selections
    result = self.getCounters('PilotAgents', counterFields, selectDict,
                              newer=last_update, timeStamp='LastUpdateTime')
    if not result['OK']:
        return result

    # Pilots aborted in the last hour
    last_update = Time.dateTime() - Time.hour
    selectDict['Status'] = 'Aborted'
    resultHour = self.getCounters('PilotAgents', counterFields, selectDict,
                                  newer=last_update, timeStamp='LastUpdateTime')
    if not resultHour['OK']:
        return resultHour

    # Pilots aborted or done in the last day
    last_update = Time.dateTime() - Time.day
    selectDict['Status'] = ['Aborted', 'Done']
    resultDay = self.getCounters('PilotAgents', counterFields, selectDict,
                                 newer=last_update, timeStamp='LastUpdateTime')
    if not resultDay['OK']:
        return resultDay

    # Pilots done in the last day with CurrentJobID equal to 0
    selectDict['CurrentJobID'] = 0
    selectDict['Status'] = 'Done'
    resultDayEmpty = self.getCounters('PilotAgents', counterFields, selectDict,
                                      newer=last_update, timeStamp='LastUpdateTime')
    if not resultDayEmpty['OK']:
        return resultDayEmpty

    ceMap = {}
    resMap = getCESiteMapping()
    if resMap['OK']:
        ceMap = resMap['Value']

    def _resolve(attDict):
        """ Return (site, ce, state), taking the site from the CE mapping if unknown """
        site = attDict['GridSite']
        ce = attDict['DestinationSite']
        if site == 'Unknown' and ce not in ('Unknown', 'Multiple') and ce in ceMap:
            site = ceMap[ce]
        return site, ce, attDict['Status']

    # Sort out different counters
    resultDict = {'Unknown': {}}
    for attDict, count in result['Value']:
        site, ce, state = _resolve(attDict)
        ceCounters = resultDict.setdefault(site, {}).setdefault(ce, dict.fromkeys(allStateNames, 0))
        ceCounters[state] = count

    # The time limited counters replace or complete the ones of the main query.
    # Each entry maps the pilot status of a counter to the column it fills.
    overrides = ((resultDay, {'Done': 'Done', 'Aborted': 'Aborted'}),
                 (resultDayEmpty, {'Done': 'Done_Empty'}),
                 (resultHour, {'Aborted': 'Aborted_Hour'}))
    for counters, targets in overrides:
        for attDict, count in counters['Value']:
            site, ce, state = _resolve(attDict)
            if state not in targets:
                continue
            ceCounters = resultDict.get(site, {}).get(ce)
            if ceCounters is None:
                # The main query did not report this CE (narrower time selection
                # or pilot updated in between the queries): nothing to complete
                continue
            ceCounters[targets[state]] = count

    records = []
    siteSumDict = dict.fromkeys(summedNames, 0)
    for site, ceDict in resultDict.items():
        sumDict = dict.fromkeys(summedNames, 0)
        for ce, ceCounters in ceDict.items():
            itemList = [site, ce]
            itemList.extend(ceCounters[state] for state in allStateNames)
            for state in allStateNames:
                sumDict[state] += ceCounters[state]
            # Done_Empty and Aborted_Hour are subsets of Done and Aborted,
            # they must not be counted twice in the total
            total = sum(ceCounters[state] for state in stateNames)
            sumDict['Total'] += total
            # Add the total number of pilots seen in the last day
            itemList.append(total)
            # Add the efficiencies and the quality status of the CE
            itemList.extend(_evaluate(ceCounters['Done'], ceCounters['Done_Empty'],
                                      ceCounters['Aborted'], total, 10))
            if len(ceDict) == 1 or expand_site:
                records.append(itemList)

        if len(ceDict) > 1 and not expand_site:
            # Sites with several CEs are shown as a single summed up record
            itemList = [site, 'Multiple']
            itemList.extend(sumDict[state] for state in summedNames)
            # Add the efficiencies and the quality status of the Site
            itemList.extend(_evaluate(sumDict['Done'], sumDict['Done_Empty'],
                                      sumDict['Aborted'], sumDict['Total'], 10))
            records.append(itemList)

        for state in summedNames:
            siteSumDict[state] += sumDict[state]

    # Perform site selection
    if site_select:
        records = [r for r in records if r[0] in site_select]

    # Perform status selection
    if status_select:
        records = [r for r in records if r[statusIndex] in status_select]

    # Get the Site Mask data
    result = SiteStatus().getUsableSites()
    if result['OK']:
        siteMask = result['Value']
        for r in records:
            r.append('Yes' if r[0] in siteMask else 'No')
    else:
        for r in records:
            r.append('Unknown')

    finalDict = {}
    finalDict['TotalRecords'] = len(records)
    finalDict['ParameterNames'] = recordNames

    # Return all the records if maxItems == 0 or the specified number otherwise
    if maxItems:
        finalDict['Records'] = records[startItem:startItem + maxItems]
    else:
        finalDict['Records'] = records

    # Overall evaluation, it requires more statistics than a single CE or site
    pilotsPerJob, pilotJobEff, status = _evaluate(siteSumDict['Done'], siteSumDict['Done_Empty'],
                                                  siteSumDict['Aborted'], siteSumDict['Total'], 100)
    siteSumDict['PilotsPerJob'] = pilotsPerJob
    siteSumDict['PilotJobEff'] = pilotJobEff
    siteSumDict['Status'] = status
    finalDict['Extras'] = siteSumDict

    return S_OK(finalDict)
class SiteInspectorAgent(AgentModule):
    """ SiteInspectorAgent

    The SiteInspectorAgent agent is an agent that is used to get the all the
    site names and trigger PEP to evaluate their status.
    """

    # Max number of worker threads by default
    __maxNumberOfThreads = 15

    # Inspection freqs, defaults, the lower, the higher priority to be checked.
    # Error state usually means there is a glitch somewhere, so it has the highest
    # priority.
    # NOTE(review): not referenced by any method of this class - confirm whether
    # it is still needed.
    __checkingFreqs = {
        'Active': 20,
        'Degraded': 20,
        'Probing': 20,
        'Banned': 15,
        'Unknown': 10,
        'Error': 5
    }

    def __init__(self, *args, **kwargs):
        """ c'tor, the members are only declared here and set in initialize / execute
        """
        AgentModule.__init__(self, *args, **kwargs)

        # Queue of the sites to be processed in the current cycle
        self.sitesToBeChecked = None
        self.threadPool = None
        self.siteClient = None
        # Clients handed over to the PEP
        self.clients = {}

    def initialize(self):
        """ Standard initialize: creates the thread pool and the clients.

        :return: S_OK
        """
        maxNumberOfThreads = self.am_getOption('maxNumberOfThreads',
                                               self.__maxNumberOfThreads)
        self.threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)

        self.siteClient = SiteStatus()

        self.clients['SiteStatus'] = self.siteClient
        self.clients['ResourceManagementClient'] = ResourceManagementClient()

        return S_OK()

    def execute(self):
        """ execute

        This is the main method of the agent. It gets the sites from the Database,
        calculates how many threads should be started and spawns them. Each thread
        will get a site from the queue until it is empty. At the end, the method
        will join the queue such that the agent will not terminate a cycle until
        all sites have been processed.

        :return: S_OK, or the failed result of getSitesToBeChecked
        """
        # Gets sites to be checked ( returns a Queue )
        sitesToBeChecked = self.getSitesToBeChecked()
        if not sitesToBeChecked['OK']:
            self.log.error(sitesToBeChecked['Message'])
            return sitesToBeChecked
        self.sitesToBeChecked = sitesToBeChecked['Value']

        queueSize = self.sitesToBeChecked.qsize()
        pollingTime = self.am_getPollingTime()

        # Assigns number of threads on the fly such that we exhaust the PollingTime
        # without having to spawn too many threads. We assume 10 seconds per element
        # to be processed ( actually, it takes something like 1 sec per element ):
        # numberOfThreads = elements * 10(s/element) / pollingTime
        numberOfThreads = int(math.ceil(queueSize * 10. / pollingTime))

        self.log.info('Needed %d threads to process %d elements' %
                      (numberOfThreads, queueSize))

        for _x in xrange(numberOfThreads):
            jobUp = self.threadPool.generateJobAndQueueIt(self._execute)
            if not jobUp['OK']:
                self.log.error(jobUp['Message'])

        self.log.info('blocking until all sites have been processed')
        # block until all tasks are done
        self.sitesToBeChecked.join()
        self.log.info('done')

        return S_OK()

    def getSitesToBeChecked(self):
        """ getElementsToBeChecked

        This method gets all the site names from the SiteStatus table, after that
        it get the details of each site (status, name, etc..) and adds them to a
        queue.

        :return: S_OK( Queue ) with one dictionary per site, or the failed result
                 of the SiteStatus client
        """
        toBeChecked = Queue.Queue()

        res = self.siteClient.getSites('All')
        if not res['OK']:
            return res

        # get the current status
        res = self.siteClient.getSiteStatuses(res['Value'])
        if not res['OK']:
            return res

        # filter elements
        for site in res['Value']:
            status = res['Value'].get(site, 'Unknown')

            toBeChecked.put({
                'status': status,
                'name': site,
                'site': site,
                'element': 'Site',
                'statusType': 'all',
                'elementType': 'Site'
            })

        return S_OK(toBeChecked)

    # Private methods ............................................................

    def _execute(self):
        """ Method run by each of the thread that is in the ThreadPool.

        It enters a loop until there are no sites on the queue. On each
        iteration, it evaluates the policies for such site and enforces the
        necessary actions. If there are no more sites in the queue, the loop is
        finished.

        :return: S_OK once the queue is empty
        """
        pep = PEP(clients=self.clients)

        while True:
            try:
                site = self.sitesToBeChecked.get_nowait()
            except Queue.Empty:
                return S_OK()

            try:
                resEnforce = pep.enforce(site)
                if not resEnforce['OK']:
                    self.log.error('Failed policy enforcement', resEnforce['Message'])
            except Exception:
                # A failure on one site must neither stop this worker nor leave
                # the site unaccounted for, execute() is waiting on the queue
                self.log.exception('Exception during policy enforcement for site %s' % site['name'])
            finally:
                # Used together with join ! It has to be called exactly once per
                # site taken from the queue, whatever the outcome
                self.sitesToBeChecked.task_done()
def initialize(self):
    """ Create the SiteStatus client and keep it as self.siteClient

    :return: S_OK
    """
    statusClient = SiteStatus()
    self.siteClient = statusClient
    return S_OK()
class DiracAdmin(API): """ Administrative functionalities """ ############################################################################# def __init__(self): """Internal initialization of the DIRAC Admin API. """ super(DiracAdmin, self).__init__() self.csAPI = CSAPI() self.dbg = False if gConfig.getValue(self.section + '/LogLevel', 'DEBUG') == 'DEBUG': self.dbg = True self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp') self.currentDir = os.getcwd() self.rssFlag = ResourceStatus().rssFlag self.sitestatus = SiteStatus() self._siteSet = set(getSites().get('Value', [])) ############################################################################# def uploadProxy(self): """Upload a proxy to the DIRAC WMS. This method Example usage: >>> print diracAdmin.uploadProxy('dteam_pilot') {'OK': True, 'Value': 0L} :return: S_OK,S_ERROR :param permanent: Indefinitely update proxy :type permanent: boolean """ return gProxyManager.uploadProxy() ############################################################################# def setProxyPersistency(self, userDN, userGroup, persistent=True): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> gLogger.notice(diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )) {'OK': True } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param persistent: Persistent flag :type persistent: boolean :return: S_OK,S_ERROR """ return gProxyManager.setPersistency(userDN, userGroup, persistent) ############################################################################# def checkProxyUploaded(self, userDN, userGroup, requiredTime): """Set the persistence of a proxy in the Proxy Manager Example usage: >>> gLogger.notice(diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )) {'OK': True, 'Value' : True/False } :param userDN: User DN :type userDN: string :param userGroup: DIRAC Group :type userGroup: string :param requiredTime: Required life time 
of the uploaded proxy :type requiredTime: boolean :return: S_OK,S_ERROR """ return gProxyManager.userHasProxy(userDN, userGroup, requiredTime) ############################################################################# def getSiteMask(self, printOutput=False, status='Active'): """Retrieve current site mask from WMS Administrator service. Example usage: >>> gLogger.notice(diracAdmin.getSiteMask()) {'OK': True, 'Value': 0L} :return: S_OK,S_ERROR """ result = self.sitestatus.getSites(siteState=status) if result['OK']: sites = result['Value'] if printOutput: sites.sort() for site in sites: gLogger.notice(site) return result ############################################################################# def getBannedSites(self, printOutput=False): """Retrieve current list of banned and probing sites. Example usage: >>> gLogger.notice(diracAdmin.getBannedSites()) {'OK': True, 'Value': []} :return: S_OK,S_ERROR """ bannedSites = self.sitestatus.getSites(siteState='Banned') if not bannedSites['OK']: return bannedSites probingSites = self.sitestatus.getSites(siteState='Probing') if not probingSites['OK']: return probingSites mergedList = sorted(bannedSites['Value'] + probingSites['Value']) if printOutput: gLogger.notice('\n'.join(mergedList)) return S_OK(mergedList) ############################################################################# def getSiteSection(self, site, printOutput=False): """Simple utility to get the list of CEs for DIRAC site name. 
Example usage: >>> gLogger.notice(diracAdmin.getSiteSection('LCG.CERN.ch')) {'OK': True, 'Value':} :return: S_OK,S_ERROR """ gridType = site.split('.')[0] if not gConfig.getSections('/Resources/Sites/%s' % (gridType))['OK']: return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType)) result = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site)) if printOutput and result['OK']: gLogger.notice(self.pPrint.pformat(result['Value'])) return result ############################################################################# def allowSite(self, site, comment, printOutput=False): """Adds the site to the site mask. Example usage: >>> gLogger.notice(diracAdmin.allowSite()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result result = self.getSiteMask(status='Active') if not result['OK']: return result siteMask = result['Value'] if site in siteMask: if printOutput: gLogger.notice('Site %s is already Active' % site) return S_OK('Site %s is already Active' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Active', comment) else: result = WMSAdministratorClient().allowSite(site, comment) if not result['OK']: return result if printOutput: gLogger.notice('Site %s status is set to Active' % site) return result ############################################################################# def getSiteMaskLogging(self, site=None, printOutput=False): """Retrieves site mask logging information. 
Example usage: >>> gLogger.notice(diracAdmin.getSiteMaskLogging('LCG.AUVER.fr')) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result if self.rssFlag: result = ResourceStatusClient().selectStatusElement('Site', 'History', name=site) else: result = WMSAdministratorClient().getSiteMaskLogging(site) if not result['OK']: return result if printOutput: if site: gLogger.notice('\nSite Mask Logging Info for %s\n' % site) else: gLogger.notice('\nAll Site Mask Logging Info\n') sitesLogging = result['Value'] if isinstance(sitesLogging, dict): for siteName, tupleList in sitesLogging.items( ): # can be an iterator if not siteName: gLogger.notice('\n===> %s\n' % siteName) for tup in tupleList: stup = str(tup[0]).ljust(8) + str(tup[1]).ljust(20) stup += '( ' + str(tup[2]).ljust(len(str( tup[2]))) + ' ) "' + str(tup[3]) + '"' gLogger.notice(stup) gLogger.notice(' ') elif isinstance(sitesLogging, list): sitesLoggingList = [(sl[1], sl[3], sl[4]) for sl in sitesLogging] for siteLog in sitesLoggingList: gLogger.notice(siteLog) return S_OK() ############################################################################# def banSite(self, site, comment, printOutput=False): """Removes the site from the site mask. 
Example usage: >>> gLogger.notice(diracAdmin.banSite()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ result = self.__checkSiteIsValid(site) if not result['OK']: return result mask = self.getSiteMask(status='Banned') if not mask['OK']: return mask siteMask = mask['Value'] if site in siteMask: if printOutput: gLogger.notice('Site %s is already Banned' % site) return S_OK('Site %s is already Banned' % site) if self.rssFlag: result = self.sitestatus.setSiteStatus(site, 'Banned', comment) else: result = WMSAdministratorClient().banSite(site, comment) if not result['OK']: return result if printOutput: gLogger.notice('Site %s status is set to Banned' % site) return result ############################################################################# def __checkSiteIsValid(self, site): """Internal function to check that a site name is valid. """ if isinstance(site, (list, set, dict)): site = set(site) - self._siteSet if not site: return S_OK() elif site in self._siteSet: return S_OK() return S_ERROR('Specified site %s is not in list of defined sites' % str(site)) ############################################################################# def getServicePorts(self, setup='', printOutput=False): """Checks the service ports for the specified setup. 
If not given this is taken from the current installation (/DIRAC/Setup) Example usage: >>> gLogger.notice(diracAdmin.getServicePorts()) {'OK': True, 'Value':''} :return: S_OK,S_ERROR """ if not setup: setup = gConfig.getValue('/DIRAC/Setup', '') setupList = gConfig.getSections('/DIRAC/Setups', []) if not setupList['OK']: return S_ERROR('Could not get /DIRAC/Setups sections') setupList = setupList['Value'] if setup not in setupList: return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList))) serviceSetups = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup) if not serviceSetups['OK']: return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup) serviceSetups = serviceSetups['Value'] # dict systemList = gConfig.getSections('/Systems') if not systemList['OK']: return S_ERROR('Could not get Systems sections') systemList = systemList['Value'] result = {} for system in systemList: if system in serviceSetups: path = '/Systems/%s/%s/Services' % (system, serviceSetups[system]) servicesList = gConfig.getSections(path) if not servicesList['OK']: self.log.warn('Could not get sections in %s' % path) else: servicesList = servicesList['Value'] if not servicesList: servicesList = [] self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList))) for service in servicesList: spath = '%s/%s/Port' % (path, service) servicePort = gConfig.getValue(spath, 0) if servicePort: self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort)) result['%s/%s' % (system, service)] = servicePort else: self.log.warn('No port found for %s' % spath) else: self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup)) if printOutput: gLogger.notice(self.pPrint.pformat(result)) return S_OK(result) ############################################################################# def getProxy(self, userDN, userGroup, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and stores this in a file in the local 
directory by default. Example usage: >>> gLogger.notice(diracAdmin.getProxy()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadProxy(userDN, userGroup, limited=limited, requiredTimeLeft=validity) ############################################################################# def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False): """Retrieves a proxy with default 12hr validity and VOMS extensions and stores this in a file in the local directory by default. Example usage: >>> gLogger.notice(diracAdmin.getVOMSProxy()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.downloadVOMSProxy(userDN, userGroup, limited=limited, requiredVOMSAttribute=vomsAttr, requiredTimeLeft=validity) ############################################################################# def getPilotProxy(self, userDN, userGroup, validity=43200): """Retrieves a pilot proxy with default 12hr validity and stores this in a file in the local directory by default. Example usage: >>> gLogger.notice(diracAdmin.getVOMSProxy()) {'OK': True, 'Value': } :return: S_OK,S_ERROR """ return gProxyManager.getPilotProxyFromDIRACGroup( userDN, userGroup, requiredTimeLeft=validity) ############################################################################# def resetJob(self, jobID): """Reset a job or list of jobs in the WMS. This operation resets the reschedule counter for a job or list of jobs and allows them to run as new. 
Example:: >>> gLogger.notice(dirac.reset(12345)) {'OK': True, 'Value': [12345]} :param job: JobID :type job: integer or list of integers :return: S_OK,S_ERROR """ if isinstance(jobID, six.string_types): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobID' ) elif isinstance(jobID, list): try: jobID = [int(job) for job in jobID] except Exception as x: return self._errorReport( str(x), 'Expected integer or convertible integer for existing jobIDs' ) result = JobManagerClient(useCertificates=False).resetJob(jobID) return result ############################################################################# def getJobPilotOutput(self, jobID, directory=''): """Retrieve the pilot output for an existing job in the WMS. The output will be retrieved in a local directory unless otherwise specified. >>> gLogger.notice(dirac.getJobPilotOutput(12345)) {'OK': True, StdOut:'',StdError:''} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) result = WMSAdministratorClient().getJobPilotOutput(jobID) if not result['OK']: return result outputPath = '%s/pilot_%s' % (directory, jobID) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if 'StdOut' in outputs: stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.verbose('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if 'StdError' in outputs: stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdError']) 
self.log.verbose('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def getPilotOutput(self, gridReference, directory=''): """Retrieve the pilot output (std.out and std.err) for an existing job in the WMS. >>> gLogger.notice(dirac.getJobPilotOutput(12345)) {'OK': True, 'Value': {}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') if not directory: directory = self.currentDir if not os.path.exists(directory): return self._errorReport('Directory %s does not exist' % directory) result = PilotManagerClient().getPilotOutput(gridReference) if not result['OK']: return result gridReferenceSmall = gridReference.split('/')[-1] if not gridReferenceSmall: gridReferenceSmall = 'reference' outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall) if os.path.exists(outputPath): self.log.info('Remove %s and retry to continue' % outputPath) return S_ERROR('Remove %s and retry to continue' % outputPath) if not os.path.exists(outputPath): self.log.verbose('Creating directory %s' % outputPath) os.mkdir(outputPath) outputs = result['Value'] if 'StdOut' in outputs: stdout = '%s/std.out' % (outputPath) with open(stdout, 'w') as fopen: fopen.write(outputs['StdOut']) self.log.info('Standard output written to %s' % (stdout)) else: self.log.warn('No standard output returned') if 'StdErr' in outputs: stderr = '%s/std.err' % (outputPath) with open(stderr, 'w') as fopen: fopen.write(outputs['StdErr']) self.log.info('Standard error written to %s' % (stderr)) else: self.log.warn('No standard error returned') self.log.always('Outputs retrieved in %s' % outputPath) return result ############################################################################# def 
getPilotInfo(self, gridReference): """Retrieve info relative to a pilot reference >>> gLogger.notice(dirac.getPilotInfo(12345)) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') result = PilotManagerClient().getPilotInfo(gridReference) return result ############################################################################# def killPilot(self, gridReference): """Kill the pilot specified >>> gLogger.notice(dirac.getPilotInfo(12345)) {'OK': True, 'Value': {}} :param gridReference: Pilot Job Reference :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') result = PilotManagerClient().killPilot(gridReference) return result ############################################################################# def getPilotLoggingInfo(self, gridReference): """Retrieve the pilot logging info for an existing job in the WMS. >>> gLogger.notice(dirac.getPilotLoggingInfo(12345)) {'OK': True, 'Value': {"The output of the command"}} :param gridReference: Gridp pilot job reference Id :type gridReference: string :return: S_OK,S_ERROR """ if not isinstance(gridReference, six.string_types): return self._errorReport('Expected string for pilot reference') return PilotManagerClient().getPilotLoggingInfo(gridReference) ############################################################################# def getJobPilots(self, jobID): """Extract the list of submitted pilots and their status for a given jobID from the WMS. Useful information is printed to the screen. 
>>> gLogger.notice(dirac.getJobPilots()) {'OK': True, 'Value': {PilotID:{StatusDict}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ if isinstance(jobID, six.string_types): try: jobID = int(jobID) except Exception as x: return self._errorReport( str(x), 'Expected integer or string for existing jobID') result = PilotManagerClient().getPilots(jobID) if result['OK']: gLogger.notice(self.pPrint.pformat(result['Value'])) return result ############################################################################# def getPilotSummary(self, startDate='', endDate=''): """Retrieve the pilot output for an existing job in the WMS. Summary is printed at INFO level, full dictionary of results also returned. >>> gLogger.notice(dirac.getPilotSummary()) {'OK': True, 'Value': {CE:{Status:Count}}} :param job: JobID :type job: integer or string :return: S_OK,S_ERROR """ result = PilotManagerClient().getPilotSummary(startDate, endDate) if not result['OK']: return result ceDict = result['Value'] headers = 'CE'.ljust(28) i = 0 for ce, summary in ceDict.iteritems(): states = summary.keys() if len(states) > i: i = len(states) for i in xrange(i): headers += 'Status'.ljust(12) + 'Count'.ljust(12) gLogger.notice(headers) for ce, summary in ceDict.iteritems(): line = ce.ljust(28) states = sorted(summary) for state in states: count = str(summary[state]) line += state.ljust(12) + count.ljust(12) gLogger.notice(line) return result ############################################################################# def setSiteProtocols(self, site, protocolsList, printOutput=False): """ Allows to set the defined protocols for each SE for a given site. 
"""
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
        return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
        return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue(
        '/Resources/StorageElements/DefaultProtocols', [])
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    # Every requested protocol must be one of the configured defaults
    for protocol in protocolsList:
        if protocol not in defaultProtocols:
            return S_ERROR(
                'Requested to set protocol %s in list but %s is not '
                'in default list of protocols:\n%s' % (protocol, protocol, ', '.join(defaultProtocols)))

    modifiedCS = False
    result = promptUser(
        'Do you want to add the following default protocols:'
        ' %s for SE(s):\n%s' % (', '.join(protocolsList), ', '.join(siteSEs)))
    if not result['OK']:
        return result
    if result['Value'].lower() != 'y':
        self.log.always('No protocols will be added')
        return S_OK()

    # Only the SE sections whose ProtocolName is SRM2 get their ProtocolsList set
    for se in siteSEs:
        sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
        if not sections['OK']:
            return sections
        for section in sections['Value']:
            if gConfig.getValue(
                    '/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2':
                path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (
                    se, section)
                self.log.verbose('Setting %s to %s' % (path, ', '.join(protocolsList)))
                result = self.csSetOption(path, ', '.join(protocolsList))
                if not result['OK']:
                    return result
                modifiedCS = True

    # Commit once, and only if at least one option was set
    if modifiedCS:
        result = self.csCommitChanges(False)
        if not result['OK']:
            return S_ERROR('CS Commit failed with message = %s' % (result['Message']))
        else:
            if printOutput:
                gLogger.notice('Successfully committed changes to CS')
    else:
        if printOutput:
            gLogger.notice('No modifications to CS required')

    return S_OK()

#############################################################################
def csSetOption(self, optionPath, optionValue):
    """
    Set the option at optionPath to optionValue in the CS.

    Thin wrapper: returns whatever self.csAPI.setOption returns.
    """
    return self.csAPI.setOption(optionPath, optionValue)

#############################################################################
def csSetOptionComment(self, optionPath, comment):
    """
    Set the comment attached to the option at optionPath in the CS.

    Thin wrapper: returns whatever self.csAPI.setOptionComment returns.
    """
    return self.csAPI.setOptionComment(optionPath, comment)

#############################################################################
def csModifyValue(self, optionPath, newValue):
    """
    Function to modify an existing value in the CS.

    Thin wrapper: returns whatever self.csAPI.modifyValue returns.
    """
    return self.csAPI.modifyValue(optionPath, newValue)

#############################################################################
def csRegisterUser(self, username, properties):
    """
    Registers a user in the CS.

        - username: Username of the user (easy;)
        - properties: Dict containing:
            - DN
            - groups : list/tuple of groups the user belongs to
            - <others> : More properties of the user, like mail

    Thin wrapper: returns whatever self.csAPI.addUser returns.
    """
    return self.csAPI.addUser(username, properties)

#############################################################################
def csDeleteUser(self, user):
    """
    Deletes a user from the CS. Can take a list of users

    Thin wrapper: returns whatever self.csAPI.deleteUsers returns.
    """
    return self.csAPI.deleteUsers(user)

#############################################################################
def csModifyUser(self, username, properties, createIfNonExistant=False):
    """
    Modify a user in the CS. Takes the same params as in addUser and
    applies the changes

    createIfNonExistant is forwarded unchanged to self.csAPI.modifyUser.
    """
    return self.csAPI.modifyUser(username, properties, createIfNonExistant)

#############################################################################
def csListUsers(self, group=False):
    """
    Lists the users in the CS. If no group is specified return all users.

    Thin wrapper: returns whatever self.csAPI.listUsers returns.
    """
    return self.csAPI.listUsers(group)

#############################################################################
def csDescribeUsers(self, mask=False):
    """
    List users and their properties in the CS.
    If a mask is given, only users in the mask will be returned

    Thin wrapper: returns whatever self.csAPI.describeUsers returns.
    """
    return self.csAPI.describeUsers(mask)

#############################################################################
def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
    """
    Modify a group in the CS. Takes the same params as in addGroup and
    applies the changes

    createIfNonExistant is forwarded unchanged to self.csAPI.modifyGroup.
    """
    return self.csAPI.modifyGroup(groupname, properties, createIfNonExistant)

#############################################################################
def csListHosts(self):
    """
    Lists the hosts in the CS

    Thin wrapper: returns whatever self.csAPI.listHosts returns.
    """
    return self.csAPI.listHosts()

#############################################################################
def csDescribeHosts(self, mask=False):
    """
    Gets extended info for the hosts in the CS

    mask is forwarded unchanged to self.csAPI.describeHosts.
    """
    return self.csAPI.describeHosts(mask)

#############################################################################
def csModifyHost(self, hostname, properties, createIfNonExistant=False):
    """
    Modify a host in the CS. Takes the same params as in addHost and
    applies the changes

    createIfNonExistant is forwarded unchanged to self.csAPI.modifyHost.
    """
    return self.csAPI.modifyHost(hostname, properties, createIfNonExistant)

#############################################################################
def csListGroups(self):
    """
    Lists groups in the CS

    Thin wrapper: returns whatever self.csAPI.listGroups returns.
    """
    return self.csAPI.listGroups()

#############################################################################
def csDescribeGroups(self, mask=False):
    """
    List groups and their properties in the CS.
    If a mask is given, only groups in the mask will be returned

    Thin wrapper: returns whatever self.csAPI.describeGroups returns.
    """
    return self.csAPI.describeGroups(mask)

#############################################################################
def csSyncUsersWithCFG(self, usersCFG):
    """
    Synchronize users in cfg with its contents

    Thin wrapper: returns whatever self.csAPI.syncUsersWithCFG returns.
    """
    return self.csAPI.syncUsersWithCFG(usersCFG)

#############################################################################
def csCommitChanges(self, sortUsers=True):
    """
    Commit the changes in the CS

    NOTE(review): the sortUsers argument is accepted but never used; the
    call below always passes sortUsers=False. Confirm whether this is
    intentional before forwarding the argument, since callers relying on
    the default currently get an unsorted commit.
    """
    return self.csAPI.commitChanges(sortUsers=False)

#############################################################################
def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False):
    """
    Send mail to specified address with body.

    A new NotificationClient is created for each call and all arguments
    are forwarded to its sendMail method.
    """
    notification = NotificationClient()
    return notification.sendMail(address, subject, body, fromAddress, localAttempt, html)

#############################################################################
def sendSMS(self, userName, body, fromAddress=None):
    """
    Send an SMS with the given body to the specified user.

    Returns S_ERROR without contacting the notification service when body
    is longer than 160 characters.
    """
    if len(body) > 160:
        return S_ERROR('Exceeded maximum SMS length of 160 characters')
    notification = NotificationClient()
    return notification.sendSMS(userName, body, fromAddress)

#############################################################################
def getBDIISite(self, site, host=None):
    """
    Get information about site from BDII at host
    """
    return ldapSite(site, host=host)

#############################################################################
def getBDIICluster(self, ce, host=None):
    """
    Get information about ce from BDII at host
    """
    return ldapCluster(ce, host=host)

#############################################################################
def getBDIICE(self, ce, host=None):
    """
    Get information about ce from BDII at host
    """
    return ldapCE(ce, host=host)

#############################################################################
def getBDIIService(self, ce, host=None):
    """
    Get service information about ce from BDII at host
    (delegates to ldapService)
    """
    return ldapService(ce, host=host)

#############################################################################
def getBDIICEState(self, ce, useVO=voName, host=None):
    """
    Get information about ce state from BDII at host

    useVO defaults to the module-level voName captured at definition time.
    """
    return ldapCEState(ce, useVO, host=host)

#############################################################################
def getBDIICEVOView(self, ce, useVO=voName, host=None):
    """
    Get information about ce voview from BDII at host

    useVO defaults to the module-level voName captured at definition time.
    """
    return ldapCEVOView(ce, useVO, host=host)
def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
    """ Get summary of the pilot jobs status by CE/site in a standard structure

        :param dict selectDict: selection conditions forwarded to getCounters.
            The keys 'LastUpdateTime', 'GridSite', 'Status' and 'ExpandSite'
            are consumed here and not forwarded. The caller's dictionary is
            not modified.
        :param sortList: accepted for interface compatibility, not used here
        :param int startItem: index of the first record returned when
            maxItems is non-zero
        :param int maxItems: maximum number of records returned, 0 (or any
            false value) returns all records
        :return: S_OK(dict) with keys 'TotalRecords', 'ParameterNames',
            'Records' and 'Extras', or the failed getCounters result
    """
    stateNames = ['Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running', 'Done', 'Aborted']
    allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
    paramNames = ['Site', 'CE'] + allStateNames
    summedNames = allStateNames + ['Total']

    def summaryColumns(done, empty, aborted, total, minPilots):
        """ Return [PilotsPerJob, PilotJobEff, Status] for one set of counters """
        # Pilot submission efficiency
        if (done - empty) > 0:
            pilotsPerJob = float(done) / float(done - empty)
        elif done == 0:
            pilotsPerJob = 0.
        elif empty == done:
            pilotsPerJob = 99.
        else:
            pilotsPerJob = 0.
        # Pilot job efficiency
        if total > 0:
            eff = float(total - aborted) / float(total) * 100.
        else:
            eff = 100.
        # Quality status, only meaningful above a minimal number of pilots
        if total > minPilots:
            if eff < 25.:
                quality = 'Bad'
            elif eff < 60.:
                quality = 'Poor'
            elif eff < 85.:
                quality = 'Fair'
            else:
                quality = 'Good'
        else:
            quality = 'Idle'
        return ['%.2f' % pilotsPerJob, '%.2f' % eff, quality]

    # Work on a copy: the selection is edited below and must not leak back
    # into the dictionary owned by the caller
    selectDict = dict(selectDict)

    last_update = selectDict.pop('LastUpdateTime', None)
    site_select = selectDict.pop('GridSite', [])
    if not isinstance(site_select, list):
        site_select = [site_select]
    status_select = selectDict.pop('Status', [])
    if not isinstance(status_select, list):
        status_select = [status_select]
    expand_site = ''
    if 'ExpandSite' in selectDict:
        expand_site = selectDict.pop('ExpandSite')
        site_select = [expand_site]

    def queryCounters(newer):
        """ Count pilots per (GridSite, DestinationSite, Status) for the current selection """
        return self.getCounters('PilotAgents',
                                ['GridSite', 'DestinationSite', 'Status'],
                                selectDict,
                                newer=newer,
                                timeStamp='LastUpdateTime')

    # Get all the data from the database with various selections
    result = queryCounters(last_update)
    if not result['OK']:
        return result

    last_update = Time.dateTime() - Time.hour
    selectDict['Status'] = 'Aborted'
    resultHour = queryCounters(last_update)
    if not resultHour['OK']:
        return resultHour

    last_update = Time.dateTime() - Time.day
    selectDict['Status'] = ['Aborted', 'Done']
    resultDay = queryCounters(last_update)
    if not resultDay['OK']:
        return resultDay

    selectDict['CurrentJobID'] = 0
    selectDict['Status'] = 'Done'
    resultDayEmpty = queryCounters(last_update)
    if not resultDayEmpty['OK']:
        return resultDayEmpty

    ceMap = {}
    resMap = getCESiteMapping()
    if resMap['OK']:
        ceMap = resMap['Value']

    # Sort out different counters
    resultDict = {'Unknown': {}}

    def ceCounters(attDict):
        """ Return the counters of the site/CE of attDict, creating them zeroed if needed """
        site = attDict['GridSite']
        ce = attDict['DestinationSite']
        # Same resolution rule for every query so that all of them address
        # the same resultDict entry
        if site == 'Unknown' and ce not in ('Unknown', 'Multiple') and ce in ceMap:
            site = ceMap[ce]
        siteDict = resultDict.setdefault(site, {})
        if ce not in siteDict:
            siteDict[ce] = dict.fromkeys(allStateNames, 0)
        return siteDict[ce]

    for attDict, count in result['Value']:
        ceCounters(attDict)[attDict['Status']] = count

    # Done and Aborted are reported for the last day only
    for attDict, count in resultDay['Value']:
        if attDict['Status'] in ('Done', 'Aborted'):
            ceCounters(attDict)[attDict['Status']] = count

    for attDict, count in resultDayEmpty['Value']:
        if attDict['Status'] == 'Done':
            ceCounters(attDict)['Done_Empty'] = count

    for attDict, count in resultHour['Value']:
        if attDict['Status'] == 'Aborted':
            ceCounters(attDict)['Aborted_Hour'] = count

    records = []
    siteSumDict = dict.fromkeys(summedNames, 0)
    for site, siteDict in resultDict.items():
        sumDict = dict.fromkeys(summedNames, 0)
        for ce, counters in siteDict.items():
            # Done_Empty and Aborted_Hour are subsets of Done and Aborted,
            # they must not be counted twice in the total
            total = sum(counters[state] for state in stateNames)
            for state in allStateNames:
                sumDict[state] += counters[state]
            sumDict['Total'] += total

            if len(siteDict) == 1 or expand_site:
                itemList = [site, ce] + [counters[state] for state in allStateNames]
                # Add the total number of pilots seen in the last day
                itemList.append(total)
                itemList += summaryColumns(counters['Done'], counters['Done_Empty'],
                                           counters['Aborted'], total, 10)
                records.append(itemList)

        # Several CEs collapsed in a single line unless the site is expanded
        if len(siteDict) > 1 and not expand_site:
            itemList = [site, 'Multiple'] + [sumDict[name] for name in summedNames]
            itemList += summaryColumns(sumDict['Done'], sumDict['Done_Empty'],
                                       sumDict['Aborted'], sumDict['Total'], 10)
            records.append(itemList)

        for name in summedNames:
            siteSumDict[name] += sumDict[name]

    # Perform site selection
    if site_select:
        records = [r for r in records if r[0] in site_select]

    # Perform status selection ('Status' column follows Total, PilotsPerJob, PilotJobEff)
    if status_select:
        statusIndex = len(paramNames) + 3
        records = [r for r in records if r[statusIndex] in status_select]

    # Get the Site Mask data
    siteStatus = SiteStatus()
    for r in records:
        #
        # FIXME: using only ComputingAccess
        #
        if siteStatus.isUsableSite(r[0], 'ComputingAccess'):
            r.append('Yes')
        else:
            r.append('No')

    finalDict = {}
    finalDict['TotalRecords'] = len(records)
    finalDict['ParameterNames'] = paramNames + \
        ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status', 'InMask']

    # Return all the records if maxItems == 0 or the specified number otherwise
    if maxItems:
        finalDict['Records'] = records[startItem:startItem + maxItems]
    else:
        finalDict['Records'] = records

    # Overall figures; the quality status needs more pilots than for a single CE
    pilotsPerJob, pilotJobEff, status = summaryColumns(siteSumDict['Done'],
                                                       siteSumDict['Done_Empty'],
                                                       siteSumDict['Aborted'],
                                                       siteSumDict['Total'],
                                                       100)
    siteSumDict['PilotsPerJob'] = pilotsPerJob
    siteSumDict['PilotJobEff'] = pilotJobEff
    siteSumDict['Status'] = status
    finalDict['Extras'] = siteSumDict

    return S_OK(finalDict)
def _resolveCECandidates(self, taskQueueDict):
    """ Work out the CE hosts pilots may be submitted to for a TaskQueue.

        CEs explicitly requested by the TaskQueue ('GridCEs') are returned
        as they are, bypassing the site mask. Otherwise the usable sites are
        filtered with the 'BannedSites' and 'Sites' of the TaskQueue and the
        hosts of the eligible CEs of the remaining sites are returned.

        :param dict taskQueueDict: TaskQueue description, must hold 'TaskQueueID'
        :return: list of CE host names, empty if nothing can be resolved
    """
    tqID = taskQueueDict['TaskQueueID']

    # An explicit CE request wins over the site mask (e.g. sam jobs)
    requestedCEs = taskQueueDict.get('GridCEs')
    if requestedCEs:
        self.log.info('CEs requested by TaskQueue %s:' % tqID, ', '.join(requestedCEs))
        return taskQueueDict['GridCEs']

    # Sites currently usable for computing
    maskResult = SiteStatus().getUsableSites('ComputingAccess')
    if not maskResult['OK']:
        self.log.error('Can not retrieve site Mask from DB:', maskResult['Message'])
        return []
    usableSites = maskResult['Value']
    if not usableSites:
        self.log.error('Site mask is empty')
        return []
    self.log.verbose('Site Mask: %s' % ', '.join(usableSites))

    # Drop the sites banned by the TaskQueue
    for banned in taskQueueDict.get('BannedSites', ()) if 'BannedSites' in taskQueueDict else ():
        if banned in usableSites:
            usableSites.remove(banned)
            self.log.verbose('Removing banned site %s from site Mask' % banned)

    # Restrict to the sites requested by the TaskQueue, if any
    if 'Sites' in taskQueueDict:
        siteMask = [name for name in usableSites if name in taskQueueDict['Sites']]
    else:
        siteMask = list(usableSites)

    if not siteMask:
        # pilot can not be submitted
        self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' % tqID)
        return []
    self.log.info('Site Candidates for TaskQueue %s:' % tqID, ', '.join(siteMask))

    # Translate the remaining sites into CE hosts
    resources = Resources(vo=self.virtualOrganization)
    selection = {'Site': siteMask,
                 'SubmissionMode': 'gLite',
                 'CEType': ['LCG', 'CREAM']}
    eligible = resources.getEligibleResources('Computing', selection)
    if not eligible['OK']:
        self.log.error("Failed to get eligible ce's:", eligible['Message'])
        return []

    hosts = [resources.getComputingElementValue(ceName, 'Host', 'unknown')
             for ceName in eligible['Value']]
    ceMask = [host for host in hosts if host != 'unknown']

    if not ceMask:
        self.log.info('No CE Candidate found for TaskQueue %s:' % tqID, ', '.join(siteMask))

    self.log.verbose('CE Candidates for TaskQueue %s:' % tqID, ', '.join(ceMask))
    return ceMask
def initializeOptimizer(cls):
    """ Initialization of the optimizer.

        Creates the clients shared through ``cls``: a SiteStatus client stored
        as ``cls.siteClient`` and a JobDB instance stored as ``cls.__jobDB``.

        :return: S_OK()
    """
    cls.siteClient = SiteStatus()
    # NOTE(review): a double-underscore attribute is name-mangled when this
    # function is defined inside a class body - confirm readers use the same name
    cls.__jobDB = JobDB()
    return S_OK()
class Matcher(object):
    """ Logic for matching

        Given the description of a computing resource and the credentials of
        the requester, selects the highest priority waiting job the resource
        can run and records the match in the job and pilot databases.
    """

    def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
        """ c'tor

            Every collaborator can be injected; the ones not given (or given
            as a false value) are created with their default constructor.
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """ Main job selection function to find the highest priority job matching the resource capacity

            :param dict resourceDescription: description of the resource asking for a job
            :param dict credDict: credentials of the requester
            :return: dict with the matched job ('JDL', 'JobID', 'DN', 'Group',
                'PilotInfoReportedFlag' plus the job optimizer parameters),
                empty dict if no job matches
            :raises RuntimeError: on any failure of the matching machinery
        """
        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters. The tags
        # generated from MaxRAM/NumberOfProcessors are replaced by the
        # original values to keep the log readable.
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict['MaxRAM'] = resourceDescription['MaxRAM']
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict['NumberOfProcessors'] = resourceDescription['NumberOfProcessors']
        toPrintDict['Tag'] = []
        if "Tag" in resourceDict:
            for tag in resourceDict['Tag']:
                if not tag.endswith('GB') and not tag.endswith('Processors'):
                    toPrintDict['Tag'].append(tag)
        if not toPrintDict['Tag']:
            toPrintDict.pop('Tag')
        gLogger.info('Resource description for matching', printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
        result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

        if not result['OK']:
            raise RuntimeError(result['Message'])
        result = result['Value']
        if not result['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = result['jobId']
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        if resAtt['Value']['Status'] != 'Waiting':
            # The TaskQueue is out of sync with the job state: clean it up
            self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """ from resourceDescription to resourceDict (just various mods)

            :raises RuntimeError: if the credentials, pilot version or site checks fail
        """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key in resourceDict:
            self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    @staticmethod
    def _capacityTags(maxRAM, nProcessors):
        """ Convert the MaxRAM and NumberOfProcessors parameters into a list of tags

            :param maxRAM: RAM of the resource (divided by 1000 to get the GB
                tags), anything false or not convertible to int is ignored
            :param nProcessors: number of processors of the resource, anything
                false or not convertible to int is ignored
            :return: list with one '<n>GB' / '<n>Processors' tag for each n from
                2 up to the capacity (capacities above 128 generate no such
                tags), plus 'MultiProcessor' if there is more than one processor
        """
        if maxRAM:
            try:
                # Integer division: the result feeds range() below
                maxRAM = int(maxRAM) // 1000
            except ValueError:
                maxRAM = None
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except ValueError:
                nProcessors = None

        tags = []
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            if param and param <= 128:
                tags.extend('%d%s' % (par, key) for par in range(2, param + 1))

        # nProcessors may be None here, which is not comparable to an int
        if nProcessors and nProcessors > 1:
            tags.append("MultiProcessor")
        return tags

    def _processResourceDescription(self, resourceDescription):
        """ Check and form the resource description dictionary

            :param resourceDescription: a ceDict coming from a JobAgent, for example.
            :return: updated dictionary of resource description parameters
        """
        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        if resourceDescription.get('Tag'):
            resourceDict['Tag'] = resourceDescription['Tag']
            if 'RequiredTag' in resourceDescription:
                resourceDict['RequiredTag'] = resourceDescription['RequiredTag']

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        capacityTags = self._capacityTags(resourceDescription.get('MaxRAM'),
                                          resourceDescription.get('NumberOfProcessors'))
        if capacityTags:
            resourceDict.setdefault("Tag", []).extend(capacityTags)

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        # Remove the duplicates
        if 'Tag' in resourceDict:
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                  'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """ Reports the status of the matched job in jobDB and jobLoggingDB

            Do not fail if errors happen here
        """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result['OK']:
            self.log.error("Problem reporting job status",
                           "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))
        else:
            self.log.verbose("Set job attributes for jobID %s" % jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status='Matched',
                                            minor='Assigned',
                                            source='Matcher')
        if not result['OK']:
            self.log.error("Problem reporting job status",
                           "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))
        else:
            self.log.verbose("Added logging record for jobID %s" % jobID)

    def _checkMask(self, resourceDict):
        """ Check the mask: are we allowed to run normal jobs?

            FIXME: should we move to site OR SE?

            :return: True if the site of the resource is usable, False otherwise
            :raises RuntimeError: if the site is missing or the mask cannot be retrieved
        """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        return resourceDict['Site'] in result['Value']

    def _updatePilotInfo(self, resourceDict):
        """ Update pilot information - do not fail if we don't manage to do it
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            gridCE = resourceDict.get('GridCE', 'Unknown')
            site = resourceDict.get('Site', 'Unknown')
            benchmark = resourceDict.get('PilotBenchmark', 0.0)
            self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' %
                             (pilotReference, gridCE, site, benchmark))

            result = self.pilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
            if not result['OK']:
                self.log.warn("Problem updating pilot information",
                              "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """ Update pilot to job mapping information

            Failures are logged, never raised.
        """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result['OK']:
                self.log.error("Problem updating pilot information",
                               ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message']))
            result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
            if not result['OK']:
                self.log.error("Problem updating pilot information",
                               "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """ Check if we can get a job given the passed credentials

            Restricts resourceDict to the owner DN and/or groups the
            credentials are allowed to run jobs for.

            :return: the updated resourceDict
            :raises RuntimeError: if the Registry cannot resolve the groups
        """
        if Properties.GENERIC_PILOT in credDict['properties']:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            if 'OwnerGroup' not in resourceDict:
                result = Registry.getGroupsForVO(vo)
                if result['OK']:
                    resourceDict['OwnerGroup'] = result['Value']
                else:
                    raise RuntimeError(result['Message'])
        else:
            # If it's a private pilot, the DN has to be the same
            if Properties.PILOT in credDict['properties']:
                self.log.notice("Setting the resource DN to the credentials DN")
                resourceDict['OwnerDN'] = credDict['DN']
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            elif Properties.JOB_SHARING in credDict['properties']:
                resourceDict['OwnerGroup'] = credDict['group']
                self.log.notice("Setting the resource group to the credentials group")
                if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
                    ownerDN = resourceDict['OwnerDN']
                    result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                    if not result['OK']:
                        raise RuntimeError(result['Message'])
                    if credDict['group'] not in result['Value']:
                        # DN is not in the same group! bad boy.
                        self.log.notice("You cannot request jobs from DN %s. It does not belong to your group!" %
                                        ownerDN)
                        resourceDict['OwnerDN'] = credDict['DN']
            # Nothing special, group and DN have to be the same
            else:
                resourceDict['OwnerDN'] = credDict['DN']
                resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """ Check the pilot DIRAC version

            Nothing is checked unless the Pilot/CheckVersion option is set.

            :raises RuntimeError: if the version or the project of the pilot
                is missing or is not the expected one
        """
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            # ReleaseVersion takes precedence over DIRACVersion
            if 'ReleaseVersion' not in resourceDict:
                if 'DIRACVersion' not in resourceDict:
                    raise RuntimeError('Version check requested and not provided by Pilot')
                else:
                    pilotVersion = resourceDict['DIRACVersion']
            else:
                pilotVersion = resourceDict['ReleaseVersion']

            validVersions = self.opsHelper.getValue("Pilot/Version", [])
            if validVersions and pilotVersion not in validVersions:
                raise RuntimeError('Pilot version does not match the production version %s not in ( %s )' %
                                   (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if 'ReleaseProject' not in resourceDict:
                    raise RuntimeError("Version check requested but expected project %s not received" % validProject)
                if resourceDict['ReleaseProject'] != validProject:
                    raise RuntimeError("Version check requested but expected project %s != received %s" %
                                       (validProject, resourceDict['ReleaseProject']))
def main():
    """ Check which queues match the job description given as first argument.

        Reads the JDL file, resolves the queues available to the VO of the
        current proxy (optionally restricted with the site switch) and prints
        a table with the status of each queue and whether the job matches it.
        Exits with -1 on any failure to collect the needed information.
    """
    global fullMatch
    global sites
    Script.registerSwitch("F", "full-match", "Check all the matching criteria", setFullMatch)
    Script.registerSwitch(
        "S:", "site=", "Check matching for these sites (comma separated list)", setSites)
    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()

    if not args:
        gLogger.error("Error: No job description provided")
        Script.showHelp(exitCode=1)

    # DIRAC modules are imported only after the command line is parsed
    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result['OK']:
        gLogger.error('No proxy found, please login')
        DIRACExit(-1)
    voName = result['Value']

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteDict = resultQueues['Value']

    # Check the result of the resolution itself, not of the previous call
    result = getQueuesResolved(siteDict)
    if not result['OK']:
        gLogger.error('Failed to get CE information', result['Message'])
        DIRACExit(-1)
    queueDict = result['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        gLogger.error('Failed to get Site mask information')
        DIRACExit(-1)
    siteMaskList = resultMask.get('Value', [])

    rssClient = ResourceStatus()

    fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason')
    records = []

    for queueInfo in queueDict.values():
        site = queueInfo['Site']
        ce = queueInfo['CEName']
        siteStatus = "Active" if site in siteMaskList else "InActive"
        # Without RSS the CE simply follows the status of its site
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result['OK']:
                ceStatus = result['Value'][ce]['all']

        result = matchQueue(jdl, queueInfo, fullMatch=fullMatch)
        if not result['OK']:
            gLogger.error('Failed in getting match data', result['Message'])
            DIRACExit(-1)

        # A queue is usable only if both its site and its CE are active
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result['Value']['Match']:
            records.append((site, ce, queueInfo['Queue'], status, 'Yes', ''))
        else:
            records.append((site, ce, queueInfo['Queue'], status, 'No', result['Value']['Reason']))

    gLogger.notice(
        printTable(fields, records, sortField='Site', columnSeparator=' ', printOut=False))
class CloudDirector(AgentModule):
    """The CloudDirector works like a SiteDirector for cloud sites:
    It looks at the queued jobs in the task queues and attempts to
    start VM instances to meet the current demand.
    """

    def __init__(self, *args, **kwargs):
        """Set up the empty per-agent state; real values are filled in by beginExecution()."""
        super(CloudDirector, self).__init__(*args, **kwargs)
        # VM type name ("<ce>_<vmType>") -> description dictionary, rebuilt by getEndpoints()
        self.vmTypeDict = {}
        # VM type name -> {"Hash": ..., "CE": ...}, lets getEndpoints() reuse endpoint objects
        self.vmTypeCECache = {}
        self.vmTypeSlots = {}
        # VM type name -> number of failed submissions
        self.failedVMTypes = defaultdict(int)
        self.firstPass = True

        self.vo = ""
        self.group = ""
        # self.voGroups contain all the eligible user groups for clouds submitted by this SiteDirector
        self.voGroups = []
        self.cloudDN = ""
        self.cloudGroup = ""
        self.platforms = []
        self.sites = []
        self.siteClient = None

        self.proxy = None

        self.updateStatus = True
        self.getOutput = False
        self.sendAccounting = True

    def initialize(self):
        """Create the SiteStatus client used to query the site mask.

        :return: S_OK()
        """
        self.siteClient = SiteStatus()
        return S_OK()

    def beginExecution(self):
        """Read the agent options and rebuild the VM type descriptions.

        :return: S_OK() on success, otherwise the failed result of the
                 registry, credentials, VM types or endpoint lookup
        """
        # The Director is for a particular user community
        self.vo = self.am_getOption("VO", "")
        if not self.vo:
            self.vo = CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", "")

        # Choose the group for which clouds will be submitted. This is a hack until
        # we will be able to match clouds to VOs.
        if not self.group:
            if self.vo:
                result = Registry.getGroupsForVO(self.vo)
                if not result["OK"]:
                    return result
                self.voGroups = []
                for group in result["Value"]:
                    if "NormalUser" in Registry.getPropertiesForGroup(group):
                        self.voGroups.append(group)
        else:
            self.voGroups = [self.group]

        result = findGenericCloudCredentials(vo=self.vo)
        if not result["OK"]:
            return result
        self.cloudDN, self.cloudGroup = result["Value"]
        self.maxVMsToSubmit = self.am_getOption("MaxVMsToSubmit", 1)
        self.runningPod = self.am_getOption("RunningPod", self.vo)

        # Get the site description dictionary
        siteNames = None
        if not self.am_getOption("Site", "Any").lower() == "any":
            siteNames = self.am_getOption("Site", [])
            if not siteNames:
                siteNames = None
        ces = None
        if not self.am_getOption("CEs", "Any").lower() == "any":
            ces = self.am_getOption("CEs", [])
            if not ces:
                ces = None

        result = getVMTypes(vo=self.vo, siteList=siteNames)
        if not result["OK"]:
            return result
        resourceDict = result["Value"]
        result = self.getEndpoints(resourceDict)
        if not result["OK"]:
            return result

        # if not siteNames:
        #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
        #  if siteName == 'Unknown':
        #    return S_OK( 'No site specified for the SiteDirector' )
        #  else:
        #    siteNames = [siteName]
        # self.siteNames = siteNames

        self.log.always("Sites:", siteNames)
        self.log.always("CEs:", ces)
        self.log.always("CloudDN:", self.cloudDN)
        self.log.always("CloudGroup:", self.cloudGroup)

        self.localhost = socket.getfqdn()
        self.proxy = ""

        # Report the served VM types only once, on the first cycle
        if self.firstPass:
            if self.vmTypeDict:
                self.log.always("Agent will serve VM types:")
                for vmType in self.vmTypeDict:
                    self.log.always(
                        "Site: %s, CE: %s, VMType: %s"
                        % (self.vmTypeDict[vmType]["Site"], self.vmTypeDict[vmType]["CEName"], vmType)
                    )
        self.firstPass = False
        return S_OK()

    def __generateVMTypeHash(self, vmTypeDict):
        """Generate a hash of the queue description

        :param dict vmTypeDict: VM type description
        :return: hexadecimal MD5 digest of the sorted dictionary items
        """
        myMD5 = hashlib.md5()
        myMD5.update(str(sorted(vmTypeDict.items())).encode())
        hexstring = myMD5.hexdigest()
        return hexstring

    def getEndpoints(self, resourceDict):
        """Get the list of relevant CEs and their descriptions

        Rebuilds self.vmTypeDict from scratch and extends self.platforms and
        self.sites. Note that the "VMTypes" entry is popped from each CE
        dictionary of resourceDict.

        :param dict resourceDict: site -> CE -> CE description, each with a "VMTypes" entry
        :return: S_OK() on success, otherwise the failed result of the bootstrap
                 parameters lookup or of the endpoint object creation
        """
        self.vmTypeDict = {}
        ceFactory = EndpointFactory()

        result = getPilotBootstrapParameters(vo=self.vo, runningPod=self.runningPod)
        if not result["OK"]:
            return result
        opParameters = result["Value"]

        for site in resourceDict:
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                ceTags = ceDict.get("Tag", [])
                if isinstance(ceTags, six.string_types):
                    ceTags = fromChar(ceTags)
                ceMaxRAM = ceDict.get("MaxRAM", None)
                qDict = ceDict.pop("VMTypes")
                for vmType in qDict:
                    vmTypeName = "%s_%s" % (ce, vmType)
                    self.vmTypeDict[vmTypeName] = {}
                    self.vmTypeDict[vmTypeName]["ParametersDict"] = qDict[vmType]
                    self.vmTypeDict[vmTypeName]["ParametersDict"]["VMType"] = vmType
                    self.vmTypeDict[vmTypeName]["ParametersDict"]["Site"] = site
                    self.vmTypeDict[vmTypeName]["ParametersDict"]["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")
                    self.vmTypeDict[vmTypeName]["ParametersDict"]["CPUTime"] = 99999999

                    # Merge the tags defined at the CE level with those of the VM type
                    vmTypeTags = self.vmTypeDict[vmTypeName]["ParametersDict"].get("Tag")
                    if vmTypeTags and isinstance(vmTypeTags, six.string_types):
                        vmTypeTags = fromChar(vmTypeTags)
                        self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"] = vmTypeTags
                    if ceTags:
                        if vmTypeTags:
                            allTags = list(set(ceTags + vmTypeTags))
                            self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"] = allTags
                        else:
                            self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"] = ceTags

                    # The VM type MaxRAM takes precedence over the CE one
                    maxRAM = self.vmTypeDict[vmTypeName]["ParametersDict"].get("MaxRAM")
                    maxRAM = ceMaxRAM if not maxRAM else maxRAM
                    if maxRAM:
                        self.vmTypeDict[vmTypeName]["ParametersDict"]["MaxRAM"] = maxRAM

                    ceWholeNode = ceDict.get("WholeNode", "true")
                    wholeNode = self.vmTypeDict[vmTypeName]["ParametersDict"].get("WholeNode", ceWholeNode)
                    if wholeNode.lower() in ("yes", "true"):
                        self.vmTypeDict[vmTypeName]["ParametersDict"].setdefault("Tag", [])
                        self.vmTypeDict[vmTypeName]["ParametersDict"]["Tag"].append("WholeNode")

                    # The VM type platform takes precedence over the CE one
                    platform = ""
                    if "Platform" in self.vmTypeDict[vmTypeName]["ParametersDict"]:
                        platform = self.vmTypeDict[vmTypeName]["ParametersDict"]["Platform"]
                    elif "Platform" in ceDict:
                        platform = ceDict["Platform"]
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    if "Platform" not in self.vmTypeDict[vmTypeName]["ParametersDict"] and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result["OK"]:
                            self.vmTypeDict[vmTypeName]["ParametersDict"]["Platform"] = result["Value"][0]

                    ceVMTypeDict = dict(ceDict)
                    ceVMTypeDict["CEName"] = ce
                    ceVMTypeDict["VO"] = self.vo
                    ceVMTypeDict["VMType"] = vmType
                    ceVMTypeDict["RunningPod"] = self.runningPod
                    ceVMTypeDict["CSServers"] = gConfig.getValue("/DIRAC/Configuration/Servers", [])
                    ceVMTypeDict.update(self.vmTypeDict[vmTypeName]["ParametersDict"])

                    # Allow a resource-specifc CAPath to be set (as some clouds have their own CAs)
                    # Otherwise fall back to the system-wide default(s)
                    if "CAPath" not in ceVMTypeDict:
                        ceVMTypeDict["CAPath"] = gConfig.getValue(
                            "/DIRAC/Security/CAPath", "/opt/dirac/etc/grid-security/certificates/cas.pem"
                        )

                    # Generate the CE object for the vmType or pick the already existing one
                    # if the vmType definition did not change
                    vmTypeHash = self.__generateVMTypeHash(ceVMTypeDict)
                    if vmTypeName in self.vmTypeCECache and self.vmTypeCECache[vmTypeName]["Hash"] == vmTypeHash:
                        vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"]
                    else:
                        result = ceFactory.getCEObject(parameters=ceVMTypeDict)
                        if not result["OK"]:
                            return result
                        self.vmTypeCECache.setdefault(vmTypeName, {})
                        self.vmTypeCECache[vmTypeName]["Hash"] = vmTypeHash
                        self.vmTypeCECache[vmTypeName]["CE"] = result["Value"]
                        vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"]
                        # NOTE(review): assumed to apply to newly created endpoints only - confirm
                        vmTypeCE.setBootstrapParameters(opParameters)

                    self.vmTypeDict[vmTypeName]["CE"] = vmTypeCE
                    self.vmTypeDict[vmTypeName]["CEName"] = ce
                    self.vmTypeDict[vmTypeName]["CEType"] = ceDict["CEType"]
                    self.vmTypeDict[vmTypeName]["Site"] = site
                    self.vmTypeDict[vmTypeName]["VMType"] = vmType
                    self.vmTypeDict[vmTypeName]["Platform"] = platform
                    self.vmTypeDict[vmTypeName]["MaxInstances"] = ceDict["MaxInstances"]
                    if not self.vmTypeDict[vmTypeName]["CE"].isValid():
                        self.log.error("Failed to instantiate CloudEndpoint for %s" % vmTypeName)
                        continue

                    if site not in self.sites:
                        self.sites.append(site)

        return S_OK()

    def execute(self):
        """Main execution method

        Submission errors are logged but do not fail the cycle.

        :return: S_OK()
        """
        if not self.vmTypeDict:
            self.log.warn("No site defined, exiting the cycle")
            return S_OK()

        result = self.createVMs()
        if not result["OK"]:
            self.log.error("Errors in the job submission: ", result["Message"])

        # cyclesDone = self.am_getModuleParam( 'cyclesDone' )
        # if self.updateStatus and cyclesDone % self.cloudStatusUpdateCycleFactor == 0:
        #   result = self.updatePilotStatus()
        #   if not result['OK']:
        #     self.log.error( 'Errors in updating cloud status: ', result['Message'] )

        return S_OK()

    def createVMs(self):
        """Go through defined computing elements and submit jobs if necessary

        :return: S_OK() when the cycle completed, otherwise the failed result of
                 the platform or task queue lookup, or S_ERROR if the site mask
                 can not be obtained
        """
        vmTypeList = list(self.vmTypeDict.keys())

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999}
        if self.vo:
            tqDict["VO"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites
        tags = []
        for vmType in vmTypeList:
            if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]:
                tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"]
        tqDict["Tag"] = list(set(tags))

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        matcherClient = MatcherClient()
        result = matcherClient.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        # Collect the sites requested by the waiting jobs; task queues that carry
        # "JobTypes" additionally mark their sites as test sites
        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result["Value"]:
            if "Sites" in result["Value"][tqID]:
                for site in result["Value"][tqID]["Sites"]:
                    if site.lower() != "any":
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result["Value"][tqID]:
                if "Sites" in result["Value"][tqID]:
                    for site in result["Value"][tqID]["Sites"]:
                        if site.lower() != "any":
                            testSites.add(site)
            totalWaitingJobs += result["Value"][tqID]["Jobs"]

        tqIDList = list(result["Value"].keys())

        result = virtualMachineDB.getInstanceCounters("Status", {})
        totalVMs = 0
        if result["OK"]:
            for status in result["Value"]:
                if status in ["New", "Submitted", "Running"]:
                    totalVMs += result["Value"][status]
        self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = self.siteClient.getUsableSites()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result.get("Value", [])

        vmTypeList = list(self.vmTypeDict.keys())
        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            ce = self.vmTypeDict[vmType]["CE"]
            ceName = self.vmTypeDict[vmType]["CEName"]
            vmTypeName = self.vmTypeDict[vmType]["VMType"]
            siteName = self.vmTypeDict[vmType]["Site"]
            platform = self.vmTypeDict[vmType]["Platform"]
            vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get("Tag", [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"])
            processorTags = []

            # vms support WholeNode naturally
            processorTags.append("WholeNode")

            if not anySite and siteName not in jobSites:
                self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
                continue

            if "CPUTime" in self.vmTypeDict[vmType]["ParametersDict"]:
                vmTypeCPUTime = int(self.vmTypeDict[vmType]["ParametersDict"]["CPUTime"])
            else:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            # Sites outside the mask are only matched against jobs of type "Test"
            if not siteMask:
                ceDict["JobType"] = "Test"
            if self.vo:
                ceDict["VO"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            ceDict["Tag"] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue
            result = matcherClient.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.verbose("No matching TQs found for %s" % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = list(taskQueueDict.keys())
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            self.log.verbose(
                "%d job(s) from %d task queue(s) are eligible for %s queue" % (totalTQJobs, len(tqIDList), vmType)
            )

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
            if result["OK"]:
                for status in result["Value"]:
                    if status in ["New", "Submitted"]:
                        totalWaitingVMs += result["Value"][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

            self.log.verbose("%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get("Auth")
            if authType and authType.lower() in ["x509", "voms"]:
                self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
                result = getProxyFileForCloud(ce)
                if not result["OK"]:
                    continue
                ce.setProxy(result["Value"])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug("%s: No slots available" % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
            self.log.info(
                "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d"
                % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit)
            )

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            # result = S_OK()
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"])
                self.failedVMTypes.setdefault(vmType, 0)
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result["Value"]
            totalSubmittedPilots += len(vmDict)
            self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]["InstanceID"]
                endpoint = "%s::%s" % (self.vmTypeDict[vmType]["Site"], ceName)
                result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result["OK"]:
                    continue
                pRef = "vm://" + ceName + "/" + diracUUID + ":00"
                pilotList.append(pRef)

            # Distribute the new pilots over the matched task queues with a
            # probability proportional to the task queue priority
            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last task queue: when no cumulative priority
                # exceeds the draw (e.g. all priorities are 0) the previous code
                # silently reused a stale tqID from an earlier loop.
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if tqID not in tqDict:
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, "", "", self.localhost, "Cloud", stampDict)
                if not result["OK"]:
                    self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)
        )
        return S_OK()

    def getVMInstances(self, endpoint, maxInstances):
        """Get the number of VM instances that can still be created on an endpoint.

        Instances in the "New", "Submitted" and "Running" states count against
        the limit.

        :param str endpoint: endpoint name, built as "<site>::<ce>" by createVMs()
        :param int maxInstances: maximum number of instances allowed on the endpoint
        :return: number of free slots, never negative; 0 if the instance
                 counters can not be obtained
        """
        result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
        if not result["OK"]:
            # The caller does integer arithmetic on the returned value, so
            # report "no slots" instead of handing back the error structure.
            self.log.error("Failed to get VM instance counters for endpoint %s" % endpoint, result["Message"])
            return 0

        count = 0
        for status in result["Value"]:
            if status in ["New", "Submitted", "Running"]:
                count += int(result["Value"][status])

        return max(0, maxInstances - count)