class InputDataValidation(OptimizerExecutor):
    """
    The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
    """

    @classmethod
    def initializeOptimizer(cls):
        """Initialize the class-level state shared by all executions.

        Seeds the random generator, creates the SE-status and sites-for-SE
        caches, connects to the JobDB, creates the SiteStatus client and sets
        the "FailedStatus" option.

        :returns: S_OK() on success; S_ERROR if JobDB cannot be imported or
                  if its constructor raises RuntimeError
        """
        random.seed()
        cls.__SEStatus = DictCache.DictCache()
        cls.__sitesForSE = DictCache.DictCache()

        # JobDB is imported lazily so that an import problem is reported as
        # an S_ERROR instead of breaking the module import.
        try:
            from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
        except ImportError as excp:  # "as" form: valid on Python 2.6+ and 3
            return S_ERROR("Could not import JobDB: %s" % str(excp))

        try:
            cls.__jobDB = JobDB()
        except RuntimeError:
            return S_ERROR("Cannot connect to JobDB")

        cls.__siteStatus = SiteStatus()
        cls.ex_setOption("FailedStatus", "Input Data Not Available")
        return S_OK()
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """Build the Matcher, creating every collaborator that was not injected.

    :param pilotAgentsDB: PilotAgentsDB instance to use; a new one is created when falsy
    :param jobDB: JobDB instance to use; a new one is created when falsy
    :param tqDB: TaskQueueDB instance to use; a new one is created when falsy
    :param jlDB: JobLoggingDB instance to use; a new one is created when falsy
    :param opsHelper: Operations helper to use; a new one is created when falsy
    """
    # Any falsy argument (not only None) triggers construction of the default.
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter shares the JobDB and Operations helper selected above.
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)
    self.siteClient = SiteStatus()
def initialize(self):
    """Set the default agent options and create the per-instance state.

    :returns: S_OK()
    """
    defaultOptions = (
        ("PollingTime", 60.0),
        ("maxPilotWaitingHours", 6),
    )
    for optionName, optionValue in defaultOptions:
        self.am_setOption(optionName, optionValue)

    self.queueDict = {}
    # Limits taken from the module-level constants
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT

    self.siteStatus = SiteStatus()
    return S_OK()
def _updateSiteMask(self, sitesData):
    """Tag every site entry of ``sitesData`` with its mask status.

    For each site key, ``sitesData[site]['siteMaskStatus']`` is set to
    'Allowed' when ``SiteStatus.isUsableSite(site, 'ComputingAccess')`` is
    truthy and to 'Banned' otherwise. ``sitesData`` is modified in place.

    :param sitesData: mapping of site name to a per-site dictionary
    :returns: S_OK(sitesData)
    """
    statusClient = SiteStatus()
    # Shallow copy: the per-site dictionaries are the same objects as those
    # held by sitesData, only the top-level mapping is duplicated.
    maskBySite = dict(sitesData)
    for siteName in maskBySite:
        # FIXME: we are only taking into account ComputingAccess
        usable = statusClient.isUsableSite(siteName, 'ComputingAccess')
        maskBySite[siteName]['siteMaskStatus'] = 'Allowed' if usable else 'Banned'
        sitesData[siteName]['siteMaskStatus'] = maskBySite[siteName]['siteMaskStatus']
    return S_OK(sitesData)
def initialize(self):
    """Create the thread pool and register the clients used by the agent.

    The pool size is read from the 'maxNumberOfThreads' option and used for
    both arguments of ThreadPool.

    :returns: S_OK()
    """
    poolSize = self.am_getOption('maxNumberOfThreads', self.__maxNumberOfThreads)
    self.threadPool = ThreadPool(poolSize, poolSize)

    siteClient = SiteStatus()
    self.siteClient = siteClient
    self.clients['SiteStatus'] = siteClient
    self.clients['ResourceManagementClient'] = ResourceManagementClient()

    return S_OK()
def __init__(self):
    """Set up the DIRAC Admin API: CS access, debug flag, directories and status clients."""
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # Debug mode is on exactly when the configured LogLevel equals "DEBUG";
    # "DEBUG" is also the value passed as second argument to getValue.
    configuredLevel = gConfig.getValue(self.section + "/LogLevel", "DEBUG")
    self.dbg = configuredLevel == "DEBUG"

    self.scratchDir = gConfig.getValue(self.section + "/ScratchDir", "/tmp")
    self.currentDir = os.getcwd()

    resourceStatus = ResourceStatus()
    self.rssFlag = resourceStatus.rssFlag
    self.sitestatus = SiteStatus()
def __init__(self):
    """Set up the DIRAC Admin API: CS access, debug flag, directories, status clients and known sites."""
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # Debug mode is on exactly when the configured LogLevel equals 'DEBUG';
    # 'DEBUG' is also the value passed as second argument to getValue.
    configuredLevel = gConfig.getValue(self.section + '/LogLevel', 'DEBUG')
    self.dbg = configuredLevel == 'DEBUG'

    self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp')
    self.currentDir = os.getcwd()

    resourceStatus = ResourceStatus()
    self.rssFlag = resourceStatus.rssFlag
    self.sitestatus = SiteStatus()

    # Set of the names found under 'Value' of getSites(); empty when the
    # result carries no 'Value' key.
    sitesResult = getSites()
    self._siteSet = set(sitesResult.get('Value', []))
def printCEInfo(voName):
    """Log a Site / CE / CEType / Queue / Status table for the queues of a VO.

    The site name is shown only on the first row of each site, and the CE
    name and type only on the first row of each CE. The status of a CE is
    the site status ("Active" when the site is in the usable-sites list,
    "InActive" otherwise) unless the ResourceStatus client has ``rssFlag``
    set and returns a status for the CE.

    :param voName: community passed to ``Resources.getQueues``
    :returns: S_OK(), or the failed result of ``getUsableSites``
    """
    queuesResult = Resources.getQueues(community=voName)
    if not queuesResult["OK"]:
        gLogger.error("Failed to get CE information")
        DIRACExit(-1)

    header = ("Site", "CE", "CEType", "Queue", "Status")
    rows = []

    # get list of usable sites within this cycle
    maskResult = SiteStatus().getUsableSites()
    if not maskResult["OK"]:
        return maskResult
    usableSites = maskResult.get("Value", [])

    rss = ResourceStatus()
    queuesBySite = queuesResult["Value"]

    for site in queuesBySite:
        siteStatus = "InActive"
        if site in usableSites:
            siteStatus = "Active"

        firstRowOfSite = True
        for ce in queuesBySite[site]:
            ceStatus = siteStatus
            if rss.rssFlag:
                statusResult = rss.getElementStatus(ce, "ComputingElement")
                if statusResult["OK"]:
                    ceStatus = statusResult["Value"][ce]["all"]

            for position, queue in enumerate(queuesBySite[site][ce]["Queues"]):
                if position == 0:
                    # CE name and type are printed once per CE
                    ceLabel = ce
                    ceType = queuesBySite[site][ce]["CEType"]
                else:
                    ceLabel = ""
                    ceType = ""
                siteLabel = site if firstRowOfSite else ""
                rows.append((siteLabel, ceLabel, ceType, queue, ceStatus))
                firstRowOfSite = False

    table = printTable(header, rows, printOut=False, columnSeparator=" ")
    gLogger.notice(table)
    return S_OK()
def printCEInfo(voName):
    """Log a Site / CE / CEType / Queue / Status table for the queues of a VO.

    The site name is shown only on the first row of each site, and the CE
    name and type only on the first row of each CE. The status of a CE is
    the site status ("Active" when the site is in the usable-sites list,
    "InActive" otherwise) unless the ResourceStatus client has ``rssFlag``
    set and returns a status for the CE.

    :param voName: community passed to ``Resources.getQueues``
    :returns: S_OK(), or the failed result of ``getUsableSites``
    """
    queuesResult = Resources.getQueues(community=voName)
    if not queuesResult['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)

    header = ("Site", 'CE', 'CEType', 'Queue', 'Status')
    rows = []

    # get list of usable sites within this cycle
    maskResult = SiteStatus().getUsableSites()
    if not maskResult['OK']:
        return maskResult
    usableSites = maskResult.get('Value', [])

    rss = ResourceStatus()
    queuesBySite = queuesResult['Value']

    for site in queuesBySite:
        siteStatus = "InActive"
        if site in usableSites:
            siteStatus = "Active"

        firstRowOfSite = True
        for ce in queuesBySite[site]:
            ceStatus = siteStatus
            if rss.rssFlag:
                statusResult = rss.getElementStatus(ce, "ComputingElement")
                if statusResult['OK']:
                    ceStatus = statusResult['Value'][ce]['all']

            for position, queue in enumerate(queuesBySite[site][ce]['Queues']):
                if position == 0:
                    # CE name and type are printed once per CE
                    ceLabel = ce
                    ceType = queuesBySite[site][ce]['CEType']
                else:
                    ceLabel = ''
                    ceType = ''
                siteLabel = site if firstRowOfSite else ''
                rows.append((siteLabel, ceLabel, ceType, queue, ceStatus))
                firstRowOfSite = False

    table = printTable(header, rows, printOut=False, columnSeparator=' ')
    gLogger.notice(table)
    return S_OK()
def __init__(self):
    """Create the SiteStatus client and the Storage / Computing RSS caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """
    super(ResourceStatus, self).__init__()

    self.siteStatus = SiteStatus()

    # We can set CacheLifetime and CacheHistory from CS, so that we can tune them.
    configuredLifeTime = RssConfiguration().getConfigCache()
    cacheLifeTime = int(configuredLifeTime)

    # RSSCaches, one per elementType ( StorageElement, ComputingElement )
    # Should be generated on the fly, instead of being hardcoded ?
    self.seCache = RSSCache('Storage', cacheLifeTime, self._updateSECache)
    self.ceCache = RSSCache('Computing', cacheLifeTime, self._updateCECache)
def __init__(self, submitPool):
    """Create the logger and load the default settings of the director.

    :param submitPool: name of the submit pool; it is appended to the logger
                       name unless it equals ``self.gridMiddleware``, and it
                       is written into the pilot SubmitPool option
    """
    # Logger name: "<middleware>PilotDirector", qualified by the submit pool
    # when the pool is not simply the middleware name.
    if submitPool == self.gridMiddleware:
        loggerName = '%sPilotDirector' % self.gridMiddleware
    else:
        loggerName = '%sPilotDirector/%s' % (self.gridMiddleware, submitPool)
    self.log = gLogger.getSubLogger(loggerName)

    # Pilot installation defaults
    self.pilot = DIRAC_PILOT
    self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installProject = DIRAC_PROJECT
    self.installation = DIRAC_INSTALLATION
    self.pilotExtensionsList = []

    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.extraModules = DIRAC_MODULES
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [self.gridMiddleware]

    # List-match settings
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    # Error reporting settings
    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    self.siteClient = SiteStatus()

    # Fallback logger in case no instance attribute named 'log' exists yet
    if 'log' not in self.__dict__:
        self.log = gLogger.getSubLogger('PilotDirector')
    self.log.info('Initialized')
def __checkSitesInMask(self, job, siteCandidates):
    """Filter the candidate sites down to those usable for 'ComputingAccess'.

    :param job: job identifier, used only in the log messages
    :param siteCandidates: iterable of candidate site names
    :returns: S_OK(list of candidates present in the usable-sites list, in
              their original order), or S_ERROR when the usable sites cannot
              be obtained
    """
    maskResult = SiteStatus().getUsableSites('ComputingAccess')
    if not maskResult['OK']:
        return S_ERROR('Could not get site mask')
    allowedSites = maskResult['Value']

    inMask = []
    for siteName in siteCandidates:
        if siteName in allowedSites:
            inMask.append(siteName)
            continue
        self.log.verbose(
            '%s is a candidate site for job %s but not in mask' % (siteName, job))

    self.log.info('Candidate sites in Mask are %s' % (inMask))
    return S_OK(inMask)
def getSiteMask(self, printOutput=False):
    """Retrieve the sites currently usable for 'ComputingAccess'.

    The list is obtained from ``SiteStatus().getUsableSites('ComputingAccess')``.

    Example usage:

      >>> result = diracAdmin.getSiteMask()
      >>> if result['OK']:
      ...     sites = result['Value']

    :param bool printOutput: when True and the query succeeded, the site list
                             is sorted in place and printed one site per line
    :returns: S_OK,S_ERROR -- the result of ``getUsableSites``; its 'Value'
              is the same list object that was sorted when printing
    """
    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites('ComputingAccess')
    if result['OK']:
        sites = result['Value']
        if printOutput:
            # In-place sort on purpose: the returned list stays sorted too.
            sites.sort()
            for site in sites:
                # Function-call form prints the same on Python 2 and 3.
                print(site)
    return result
def getBannedSites(self, printOutput=False):
    """Retrieve the sorted list of sites not usable for 'ComputingAccess'.

    The list is obtained from
    ``SiteStatus().getUnusableSites('ComputingAccess')``.

    Example usage:

      >>> result = diracAdmin.getBannedSites()
      >>> if result['OK']:
      ...     bannedSites = result['Value']

    :param bool printOutput: when True, print the banned sites one per line
    :returns: S_OK(sorted list of banned sites), or the failed result of
              ``getUnusableSites`` (its message is logged as a warning)
    """
    siteStatus = SiteStatus()
    result = siteStatus.getUnusableSites('ComputingAccess')
    if not result['OK']:
        self.log.warn(result['Message'])
        return result

    bannedSites = result['Value']
    bannedSites.sort()
    if printOutput:
        # Function-call form prints the same on Python 2 and 3.
        print('\n'.join(bannedSites))
    return S_OK(bannedSites)
def initialize(self):
    """Create the operations helper, the Limiter built on it and the SiteStatus client."""
    opsHelper = self.__getOpsHelper()
    self.__opsHelper = opsHelper
    # The limiter is given the helper that was just stored
    self.__limiter = Limiter(opsHelper)
    self.__siteStatus = SiteStatus()
def main():
    """Check which queues match the job described by a JDL file.

    Reads the JDL file given as first positional argument, resolves the
    queues available to the VO of the current proxy (optionally restricted
    to the sites given with ``-S``), matches the JDL against every queue and
    logs a table with columns Site, CE, Queue, Status, Match and Reason.

    The process exits through ``DIRACExit(-1)`` when the proxy, the queue
    information, the site mask or the matching data cannot be obtained.
    """
    global fullMatch
    global sites
    Script.registerSwitch("F", "full-match", "Check all the matching criteria", setFullMatch)
    Script.registerSwitch(
        "S:", "site=", "Check matching for these sites (comma separated list)", setSites)
    Script.registerArgument("job_JDL: file with job JDL description")
    _, args = Script.parseCommandLine(ignoreErrors=True)

    # DIRAC imports are done after the command line has been parsed
    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result["OK"]:
        gLogger.error("No proxy found, please login")
        DIRACExit(-1)
    voName = result["Value"]

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues["OK"]:
        gLogger.error("Failed to get CE information")
        DIRACExit(-1)
    siteDict = resultQueues["Value"]

    result = getQueuesResolved(siteDict, {}, checkPlatform=True)
    # Check the result of getQueuesResolved itself; the previous code
    # re-tested resultQueues here, so a failed resolution went unnoticed.
    if not result["OK"]:
        gLogger.error("Failed to get CE information", result["Message"])
        DIRACExit(-1)
    queueDict = result["Value"]

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask["OK"]:
        gLogger.error("Failed to get Site mask information")
        DIRACExit(-1)
    siteMaskList = resultMask.get("Value", [])

    rssClient = ResourceStatus()

    fields = ("Site", "CE", "Queue", "Status", "Match", "Reason")
    records = []

    for queueInfo in queueDict.values():
        site = queueInfo["Site"]
        ce = queueInfo["CEName"]
        siteStatus = "Active" if site in siteMaskList else "InActive"

        # The CE inherits the site status unless RSS provides its own
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result["OK"]:
                ceStatus = result["Value"][ce]["all"]

        result = matchQueue(jdl, queueInfo["ParametersDict"], fullMatch=fullMatch)
        if not result["OK"]:
            gLogger.error("Failed in getting match data", result["Message"])
            DIRACExit(-1)

        # A queue is reported Active only when both site and CE are Active
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result["Value"]["Match"]:
            records.append((site, ce, queueInfo["QueueName"], status, "Yes", ""))
        else:
            records.append((site, ce, queueInfo["QueueName"], status, "No", result["Value"]["Reason"]))

    gLogger.notice(
        printTable(fields, records, sortField="Site", columnSeparator=" ", printOut=False))
def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
    """Get summary of the pilot jobs status by CE/site in a standard structure.

    One record is produced per (site, CE) pair, or one 'Multiple' record per
    site with several CEs unless a site is expanded. Each record holds the
    site, the CE, one counter per state, the total, the pilots-per-job ratio,
    the pilot job efficiency (percent), a quality status and an 'InMask' flag.

    :param dict selectDict: selection passed to ``getCounters``. The keys
        'LastUpdateTime' (time cut of the main query), 'GridSite' (site
        filter), 'Status' (filter on the quality status column) and
        'ExpandSite' (show every CE of that site) are interpreted here and
        not forwarded. The caller's dictionary is not modified.
    :param sortList: not used
    :param int startItem: index of the first record to return
    :param int maxItems: number of records to return; 0 returns all of them
    :returns: S_OK(dict) with keys 'TotalRecords', 'ParameterNames',
              'Records' and 'Extras' (the totals over all sites), or the
              failed result of one of the ``getCounters`` calls
    """
    stateNames = ['Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running',
                  'Done', 'Aborted', 'Failed']
    allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
    paramNames = ['Site', 'CE'] + allStateNames
    evaluationNames = ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status']
    # Column of the quality status inside a record. It is derived from the
    # column layout: the previously hard-coded index 14 points at
    # 'PilotJobEff' with the ten state columns defined above.
    statusIndex = len(paramNames) + evaluationNames.index('Status')

    def evaluate(done, empty, aborted, total, minTotal):
        """Return (PilotsPerJob string, PilotJobEff string, quality status)."""
        # Pilot submission efficiency; float() keeps the ratio exact when
        # "/" is integer division (Python 2 without __future__ division).
        if (done - empty) > 0:
            pilotsPerJob = float(done) / (done - empty)
        elif done == 0:
            pilotsPerJob = 0.
        elif empty == done:
            pilotsPerJob = 99.
        else:
            pilotsPerJob = 0.

        # Pilot job efficiency, in percent
        if total > 0:
            eff = float(total - aborted) / total * 100
        else:
            eff = 100.

        # The quality is only evaluated above a minimum number of pilots
        if total > minTotal:
            if eff < 25.:
                status = 'Bad'
            elif eff < 60.:
                status = 'Poor'
            elif eff < 85.:
                status = 'Fair'
            else:
                status = 'Good'
        else:
            status = 'Idle'
        return '%.2f' % pilotsPerJob, '%.2f' % eff, status

    # Work on a copy so that the caller's dictionary is left untouched
    selectDict = dict(selectDict)

    last_update = selectDict.pop('LastUpdateTime', None)

    site_select = selectDict.pop('GridSite', [])
    if not isinstance(site_select, list):
        site_select = [site_select]

    status_select = selectDict.pop('Status', [])
    if not isinstance(status_select, list):
        status_select = [status_select]

    expand_site = ''
    if 'ExpandSite' in selectDict:
        expand_site = selectDict.pop('ExpandSite')
        site_select = [expand_site]

    # Get all the data from the database with various selections
    counterFields = ['GridSite', 'DestinationSite', 'Status']
    result = self.getCounters('PilotAgents', counterFields, selectDict,
                              newer=last_update, timeStamp='LastUpdateTime')
    if not result['OK']:
        return result

    # Pilots aborted during the last hour
    last_update = Time.dateTime() - Time.hour
    selectDict['Status'] = 'Aborted'
    resultHour = self.getCounters('PilotAgents', counterFields, selectDict,
                                  newer=last_update, timeStamp='LastUpdateTime')
    if not resultHour['OK']:
        return resultHour

    # Pilots aborted or done during the last day
    last_update = Time.dateTime() - Time.day
    selectDict['Status'] = ['Aborted', 'Done']
    resultDay = self.getCounters('PilotAgents', counterFields, selectDict,
                                 newer=last_update, timeStamp='LastUpdateTime')
    if not resultDay['OK']:
        return resultDay

    # Pilots done during the last day with CurrentJobID equal to 0
    selectDict['CurrentJobID'] = 0
    selectDict['Status'] = 'Done'
    resultDayEmpty = self.getCounters('PilotAgents', counterFields, selectDict,
                                      newer=last_update, timeStamp='LastUpdateTime')
    if not resultDayEmpty['OK']:
        return resultDayEmpty

    ceMap = {}
    resMap = getCESiteMapping()
    if resMap['OK']:
        ceMap = resMap['Value']

    # Sort out different counters
    resultDict = {}
    resultDict['Unknown'] = {}
    for attDict, count in result['Value']:
        site = attDict['GridSite']
        ce = attDict['DestinationSite']
        state = attDict['Status']
        if site == 'Unknown' and ce != "Unknown" and ce != "Multiple" and ce in ceMap:
            site = ceMap[ce]
        ceCounters = resultDict.setdefault(site, {})
        if ce not in ceCounters:
            ceCounters[ce] = dict.fromkeys(allStateNames, 0)
        ceCounters[ce][state] = count

    def selectedCounters(attDict):
        """Return the counters of the (site, CE) of attDict, None if not in the main selection."""
        site = attDict['GridSite']
        ce = attDict['DestinationSite']
        if site == 'Unknown' and ce != "Unknown" and ce in ceMap:
            site = ceMap[ce]
        return resultDict.get(site, {}).get(ce)

    # The time-limited counters only refine rows of the main selection.
    # Pairs absent from it (possible when 'LastUpdateTime' restricts the main
    # query more than the one-day / one-hour windows) are skipped instead of
    # raising a KeyError.
    for attDict, count in resultDay['Value']:
        counters = selectedCounters(attDict)
        if counters is None:
            continue
        state = attDict['Status']
        if state == "Done":
            counters["Done"] = count
        if state == "Aborted":
            counters["Aborted"] = count

    for attDict, count in resultDayEmpty['Value']:
        counters = selectedCounters(attDict)
        if counters is not None and attDict['Status'] == "Done":
            counters["Done_Empty"] = count

    for attDict, count in resultHour['Value']:
        counters = selectedCounters(attDict)
        if counters is not None and attDict['Status'] == "Aborted":
            counters["Aborted_Hour"] = count

    records = []
    siteSumDict = {}
    for site in resultDict:
        sumDict = dict.fromkeys(allStateNames, 0)
        sumDict['Total'] = 0
        for ce in resultDict[site]:
            counters = resultDict[site][ce]
            itemList = [site, ce]
            total = 0
            for state in allStateNames:
                itemList.append(counters[state])
                sumDict[state] += counters[state]
                # The two derived counters do not count towards the total
                if state != "Aborted_Hour" and state != "Done_Empty":
                    total += counters[state]
            sumDict['Total'] += total
            # Add the total number of pilots seen in the last day
            itemList.append(total)
            # Add the efficiencies and the quality status of the CE
            itemList.extend(evaluate(counters["Done"], counters["Done_Empty"],
                                     counters["Aborted"], total, 10))

            if len(resultDict[site]) == 1 or expand_site:
                records.append(itemList)

        if len(resultDict[site]) > 1 and not expand_site:
            # Several CEs at the site: report a single aggregated record
            itemList = [site, 'Multiple']
            for state in allStateNames + ['Total']:
                itemList.append(sumDict.get(state, 0))
            # Add the efficiencies and the quality status of the Site
            itemList.extend(evaluate(sumDict["Done"], sumDict["Done_Empty"],
                                     sumDict["Aborted"], sumDict["Total"], 10))
            records.append(itemList)

        for state in allStateNames + ['Total']:
            siteSumDict[state] = siteSumDict.get(state, 0) + sumDict[state]

    # Perform site selection
    if site_select:
        records = [r for r in records if r[0] in site_select]

    # Perform status selection
    if status_select:
        records = [r for r in records if r[statusIndex] in status_select]

    # Get the Site Mask data
    result = SiteStatus().getUsableSites()
    if result['OK']:
        siteMask = result['Value']
        for r in records:
            r.append('Yes' if r[0] in siteMask else 'No')
    else:
        for r in records:
            r.append('Unknown')

    finalDict = {}
    finalDict['TotalRecords'] = len(records)
    finalDict['ParameterNames'] = paramNames + evaluationNames + ['InMask']

    # Return all the records if maxItems == 0 or the specified number otherwise
    if maxItems:
        finalDict['Records'] = records[startItem:startItem + maxItems]
    else:
        finalDict['Records'] = records

    # Overall evaluation; the quality needs more than 100 pilots here
    pilotsPerJob, pilotJobEff, status = evaluate(siteSumDict["Done"],
                                                 siteSumDict["Done_Empty"],
                                                 siteSumDict["Aborted"],
                                                 siteSumDict["Total"], 100)
    siteSumDict['PilotsPerJob'] = pilotsPerJob
    siteSumDict['PilotJobEff'] = pilotJobEff
    siteSumDict['Status'] = status
    finalDict['Extras'] = siteSumDict

    return S_OK(finalDict)
def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
    """Get summary of the pilot jobs status by CE/site in a standard structure.

    One record is produced per (site, CE) pair, or one "Multiple" record per
    site with several CEs unless a site is expanded. Each record holds the
    site, the CE, one counter per state, the total, the pilots-per-job ratio,
    the pilot job efficiency (percent), a quality status and an "InMask" flag.

    :param dict selectDict: selection passed to ``getCounters``. The keys
        "LastUpdateTime" (time cut of the main query), "GridSite" (site
        filter), "Status" (filter on the quality status column) and
        "ExpandSite" (show every CE of that site) are interpreted here and
        not forwarded. The caller's dictionary is not modified.
    :param sortList: not used
    :param int startItem: index of the first record to return
    :param int maxItems: number of records to return; 0 returns all of them
    :returns: S_OK(dict) with keys "TotalRecords", "ParameterNames",
              "Records" and "Extras" (the totals over all sites), or the
              failed result of one of the ``getCounters`` calls
    """
    allStateNames = PilotStatus.PILOT_STATES + ["Done_Empty", "Aborted_Hour"]
    paramNames = ["Site", "CE"] + allStateNames
    evaluationNames = ["Total", "PilotsPerJob", "PilotJobEff", "Status"]
    # Column of the quality status inside a record. It is derived from the
    # column layout: the previously hard-coded index 14 is only correct when
    # PILOT_STATES holds exactly seven states.
    statusIndex = len(paramNames) + evaluationNames.index("Status")

    def evaluate(done, empty, aborted, total, minTotal):
        """Return (PilotsPerJob string, PilotJobEff string, quality status)."""
        # Pilot submission efficiency; float() keeps the ratio exact even
        # where "/" is integer division.
        if (done - empty) > 0:
            pilotsPerJob = float(done) / (done - empty)
        elif done == 0:
            pilotsPerJob = 0.0
        elif empty == done:
            pilotsPerJob = 99.0
        else:
            pilotsPerJob = 0.0

        # Pilot job efficiency, in percent
        if total > 0:
            eff = float(total - aborted) / total * 100
        else:
            eff = 100.0

        # The quality is only evaluated above a minimum number of pilots
        if total > minTotal:
            if eff < 25.0:
                status = "Bad"
            elif eff < 60.0:
                status = "Poor"
            elif eff < 85.0:
                status = "Fair"
            else:
                status = "Good"
        else:
            status = "Idle"
        return "%.2f" % pilotsPerJob, "%.2f" % eff, status

    # Work on a copy so that the caller's dictionary is left untouched
    selectDict = dict(selectDict)

    last_update = selectDict.pop("LastUpdateTime", None)

    site_select = selectDict.pop("GridSite", [])
    if not isinstance(site_select, list):
        site_select = [site_select]

    status_select = selectDict.pop("Status", [])
    if not isinstance(status_select, list):
        status_select = [status_select]

    expand_site = ""
    if "ExpandSite" in selectDict:
        expand_site = selectDict.pop("ExpandSite")
        site_select = [expand_site]

    # Get all the data from the database with various selections
    counterFields = ["GridSite", "DestinationSite", "Status"]
    result = self.getCounters(
        "PilotAgents",
        counterFields,
        selectDict,
        newer=last_update,
        timeStamp="LastUpdateTime",
    )
    if not result["OK"]:
        return result

    # Pilots aborted during the last hour
    last_update = Time.dateTime() - Time.hour
    selectDict["Status"] = PilotStatus.ABORTED
    resultHour = self.getCounters(
        "PilotAgents",
        counterFields,
        selectDict,
        newer=last_update,
        timeStamp="LastUpdateTime",
    )
    if not resultHour["OK"]:
        return resultHour

    # Pilots aborted or done during the last day
    last_update = Time.dateTime() - Time.day
    selectDict["Status"] = [PilotStatus.ABORTED, PilotStatus.DONE]
    resultDay = self.getCounters(
        "PilotAgents",
        counterFields,
        selectDict,
        newer=last_update,
        timeStamp="LastUpdateTime",
    )
    if not resultDay["OK"]:
        return resultDay

    # Pilots done during the last day with CurrentJobID equal to 0
    selectDict["CurrentJobID"] = 0
    selectDict["Status"] = PilotStatus.DONE
    resultDayEmpty = self.getCounters(
        "PilotAgents",
        counterFields,
        selectDict,
        newer=last_update,
        timeStamp="LastUpdateTime",
    )
    if not resultDayEmpty["OK"]:
        return resultDayEmpty

    ceMap = {}
    resMap = getCESiteMapping()
    if resMap["OK"]:
        ceMap = resMap["Value"]

    # Sort out different counters
    resultDict = {}
    resultDict["Unknown"] = {}
    for attDict, count in result["Value"]:
        site = attDict["GridSite"]
        ce = attDict["DestinationSite"]
        state = attDict["Status"]
        if site == "Unknown" and ce != "Unknown" and ce != "Multiple" and ce in ceMap:
            site = ceMap[ce]
        ceCounters = resultDict.setdefault(site, {})
        if ce not in ceCounters:
            ceCounters[ce] = dict.fromkeys(allStateNames, 0)
        ceCounters[ce][state] = count

    def selectedCounters(attDict):
        """Return the counters of the (site, CE) of attDict, None if not in the main selection."""
        site = attDict["GridSite"]
        ce = attDict["DestinationSite"]
        if site == "Unknown" and ce != "Unknown" and ce in ceMap:
            site = ceMap[ce]
        return resultDict.get(site, {}).get(ce)

    # The time-limited counters only refine rows of the main selection.
    # Pairs absent from it (possible when "LastUpdateTime" restricts the main
    # query more than the one-day / one-hour windows) are skipped instead of
    # raising a KeyError.
    for attDict, count in resultDay["Value"]:
        counters = selectedCounters(attDict)
        if counters is None:
            continue
        state = attDict["Status"]
        if state == PilotStatus.DONE:
            counters[PilotStatus.DONE] = count
        if state == PilotStatus.ABORTED:
            counters[PilotStatus.ABORTED] = count

    for attDict, count in resultDayEmpty["Value"]:
        counters = selectedCounters(attDict)
        if counters is not None and attDict["Status"] == PilotStatus.DONE:
            counters["Done_Empty"] = count

    for attDict, count in resultHour["Value"]:
        counters = selectedCounters(attDict)
        if counters is not None and attDict["Status"] == PilotStatus.ABORTED:
            counters["Aborted_Hour"] = count

    records = []
    siteSumDict = {}
    for site in resultDict:
        sumDict = dict.fromkeys(allStateNames, 0)
        sumDict["Total"] = 0
        for ce in resultDict[site]:
            counters = resultDict[site][ce]
            itemList = [site, ce]
            total = 0
            for state in allStateNames:
                itemList.append(counters[state])
                sumDict[state] += counters[state]
                # The two derived counters do not count towards the total
                if state != "Aborted_Hour" and state != "Done_Empty":
                    total += counters[state]
            sumDict["Total"] += total
            # Add the total number of pilots seen in the last day
            itemList.append(total)
            # Add the efficiencies and the quality status of the CE
            itemList.extend(
                evaluate(
                    counters[PilotStatus.DONE],
                    counters["Done_Empty"],
                    counters[PilotStatus.ABORTED],
                    total,
                    10,
                )
            )

            if len(resultDict[site]) == 1 or expand_site:
                records.append(itemList)

        if len(resultDict[site]) > 1 and not expand_site:
            # Several CEs at the site: report a single aggregated record
            itemList = [site, "Multiple"]
            for state in allStateNames + ["Total"]:
                itemList.append(sumDict.get(state, 0))
            # Add the efficiencies and the quality status of the Site
            itemList.extend(
                evaluate(
                    sumDict[PilotStatus.DONE],
                    sumDict["Done_Empty"],
                    sumDict[PilotStatus.ABORTED],
                    sumDict["Total"],
                    10,
                )
            )
            records.append(itemList)

        for state in allStateNames + ["Total"]:
            siteSumDict[state] = siteSumDict.get(state, 0) + sumDict[state]

    # Perform site selection
    if site_select:
        records = [r for r in records if r[0] in site_select]

    # Perform status selection
    if status_select:
        records = [r for r in records if r[statusIndex] in status_select]

    # Get the Site Mask data
    result = SiteStatus().getUsableSites()
    if result["OK"]:
        siteMask = result["Value"]
        for r in records:
            r.append("Yes" if r[0] in siteMask else "No")
    else:
        for r in records:
            r.append("Unknown")

    finalDict = {}
    finalDict["TotalRecords"] = len(records)
    finalDict["ParameterNames"] = paramNames + evaluationNames + ["InMask"]

    # Return all the records if maxItems == 0 or the specified number otherwise
    if maxItems:
        finalDict["Records"] = records[startItem:startItem + maxItems]
    else:
        finalDict["Records"] = records

    # Overall evaluation; the quality needs more than 100 pilots here
    pilotsPerJob, pilotJobEff, status = evaluate(
        siteSumDict[PilotStatus.DONE],
        siteSumDict["Done_Empty"],
        siteSumDict[PilotStatus.ABORTED],
        siteSumDict["Total"],
        100,
    )
    siteSumDict["PilotsPerJob"] = pilotsPerJob
    siteSumDict["PilotJobEff"] = pilotJobEff
    siteSumDict["Status"] = status
    finalDict["Extras"] = siteSumDict

    return S_OK(finalDict)
DIRACExit(-1) voName = result['Value'] resultQueues = Resources.getQueues(siteList=sites, community=voName) if not resultQueues['OK']: gLogger.error('Failed to get CE information') DIRACExit(-1) siteDict = resultQueues['Value'] result = getQueuesResolved(siteDict) if not resultQueues['OK']: gLogger.error('Failed to get CE information') DIRACExit(-1) queueDict = result['Value'] # get list of usable sites within this cycle resultMask = SiteStatus().getUsableSites() if not resultMask['OK']: gLogger.error('Failed to get Site mask information') DIRACExit(-1) siteMaskList = resultMask.get('Value', []) rssClient = ResourceStatus() fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason') records = [] for queue, queueInfo in queueDict.iteritems(): site = queueInfo['Site'] ce = queueInfo['CEName'] siteStatus = "Active" if site in siteMaskList else "InActive" ceStatus = siteStatus
def main():
    """Check which queues match the job described by a JDL file.

    Reads the JDL file given as first positional argument, resolves the
    queues available to the VO of the current proxy (optionally restricted
    to the sites given with ``-S``), matches the JDL against every queue and
    logs a table with columns Site, CE, Queue, Status, Match and Reason.

    Help is shown with exit code 1 when no JDL file is given. The process
    exits through ``DIRACExit(-1)`` when the proxy, the queue information,
    the site mask or the matching data cannot be obtained.
    """
    global fullMatch
    global sites
    Script.registerSwitch("F", "full-match", "Check all the matching criteria", setFullMatch)
    Script.registerSwitch(
        "S:", "site=", "Check matching for these sites (comma separated list)", setSites)
    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()

    if not args:
        gLogger.error("Error: No job description provided")
        Script.showHelp(exitCode=1)

    # DIRAC imports are done after the command line has been parsed
    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result['OK']:
        gLogger.error('No proxy found, please login')
        DIRACExit(-1)
    voName = result['Value']

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteDict = resultQueues['Value']

    result = getQueuesResolved(siteDict)
    # Check the result of getQueuesResolved itself; the previous code
    # re-tested resultQueues here, so a failed resolution went unnoticed.
    if not result['OK']:
        gLogger.error('Failed to get CE information', result['Message'])
        DIRACExit(-1)
    queueDict = result['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        gLogger.error('Failed to get Site mask information')
        DIRACExit(-1)
    siteMaskList = resultMask.get('Value', [])

    rssClient = ResourceStatus()

    fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason')
    records = []

    for queueInfo in queueDict.values():
        site = queueInfo['Site']
        ce = queueInfo['CEName']
        siteStatus = "Active" if site in siteMaskList else "InActive"

        # The CE inherits the site status unless RSS provides its own
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result['OK']:
                ceStatus = result['Value'][ce]['all']

        result = matchQueue(jdl, queueInfo, fullMatch=fullMatch)
        if not result['OK']:
            gLogger.error('Failed in getting match data', result['Message'])
            DIRACExit(-1)

        # A queue is reported Active only when both site and CE are Active
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result['Value']['Match']:
            records.append((site, ce, queueInfo['Queue'], status, 'Yes', ''))
        else:
            records.append((site, ce, queueInfo['Queue'], status, 'No', result['Value']['Reason']))

    gLogger.notice(
        printTable(fields, records, sortField='Site', columnSeparator=' ', printOut=False))
def initializeOptimizer(cls):
    """Attach a SiteStatus client and a JobDB instance to the class.

    :returns: S_OK()
    """
    siteClient = SiteStatus()
    cls.siteClient = siteClient

    jobDB = JobDB()
    cls.__jobDB = jobDB

    return S_OK()
def initialize(self):
    """Create the SiteStatus client used by this instance.

    :returns: S_OK()
    """
    siteClient = SiteStatus()
    self.siteClient = siteClient
    return S_OK()
def checkJob(self, job, classAdJob):
    """Run the scheduling checks for one job and route it accordingly.

    The checks are applied in this order:

    1. A rescheduled job whose delay has not yet elapsed gets the
       'On Hold: after rescheduling' application status (unless it already
       carries it) and processing stops.
    2. The job's own site / banned-site requirements are resolved.
    3. Usable and unusable sites for 'ComputingAccess' are read from SiteStatus.
    4. A job without input data is sent to the task queue.
    5. A job with input data is matched against the optimizer's site
       candidates; when staging is needed a staging request is set.

    :param job: job identifier, used in DB calls and log messages
    :param classAdJob: job description; queried with getAttributeString() and
                       passed on to the private helpers
    :return: S_OK() when the job was put on hold or a staging request was set,
             the result of __sendJobToTaskQueue() when the job is queued,
             otherwise an S_ERROR or the failed result of a helper
    """
    self.log.verbose('Job %s will be processed' % (job))

    # Check if the job was recently rescheduled
    attributes = self.jobDB.getJobAttributes(
        job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not attributes['OK']:
        self.log.error(attributes['Message'])
        return S_ERROR('Can not get job attributes from JobDB')
    jobDict = attributes['Value']

    reCounter = int(jobDict['RescheduleCounter'])
    if reCounter != 0:
        reTime = fromString(jobDict['RescheduleTime'])
        elapsed = toEpoch() - toEpoch(reTime)
        # The n-th reschedule uses the n-th configured delay; counts past the
        # end of the list fall back to maxRescheduleDelay.
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if elapsed < delay:
            # Only write the status once; later passes just keep waiting.
            if jobDict['ApplicationStatus'].find('On Hold: after rescheduling') < 0:
                self.jobDB.setJobStatus(
                    job, application='On Hold: after rescheduling #%d' % reCounter)
            return S_OK()

    # First, get Site and BannedSites from the Job
    requirement = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = requirement['BannedSites']
    userSites = requirement['Sites']

    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            return S_ERROR('Impossible Site Requirement')

    # Second, get the Active and Banned sites from the RSS
    rss = SiteStatus()
    usableResult = rss.getUsableSites('ComputingAccess')
    unusableResult = rss.getUnusableSites('ComputingAccess')
    if not usableResult['OK'] or not unusableResult['OK']:
        if not usableResult['OK']:
            self.log.error(usableResult['Message'])
        if not unusableResult['OK']:
            self.log.error(unusableResult['Message'])
        return S_ERROR('Can not get Active and Banned Sites from JobDB')
    usableSites = usableResult['Value']
    unusableSites = unusableResult['Value']

    if userSites and not applySiteRequirements(userSites, usableSites, unusableSites):
        # Put on Hold only non-excluded job types
        jobType = classAdJob.getAttributeString('JobType')
        if jobType not in self.excludedOnHoldJobTypes:
            holdMsg = 'On Hold: Requested site is Banned or not Active'
            self.log.info(holdMsg)
            self.jobDB.setJobStatus(job, application=holdMsg)
            return S_OK()

    # Third, check if there is input data
    inputResult = self.jobDB.getInputData(job)
    if not inputResult['OK']:
        self.log.warn('Failed to get input data from JobDB for %s' % (job))
        self.log.error(inputResult['Message'])
        return S_ERROR('Failed to get input data from JobDB')

    if not inputResult['Value']:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    # Empty entries do not count as input data
    inputData = [lfn for lfn in inputResult['Value'] if lfn]
    if not inputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose('Job %s has no input data requirement' % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    self.log.verbose('Job %s has an input data requirement ' % (job))

    # Fourth, Check all optimizer information
    optResult = self.__checkOptimizerInfo(job)
    if not optResult['OK']:
        return optResult
    optInfo = optResult['Value']

    # Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))

    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        return S_ERROR('Impossible Site + InputData Requirement')

    sites = applySiteRequirements(optSites, usableSites, unusableSites)
    if not sites:
        holdMsg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info(holdMsg)
        self.jobDB.setJobStatus(job, application=holdMsg)
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape
    # if more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                 optInfo['SiteCandidates'])
    if not checkStaging['OK']:
        return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR('No destination sites available')

    if not checkStaging['Value']:
        # No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose('Job %s does not require staging of input data' % (job))
        # Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)

    # Single site candidate chosen and staging required
    self.log.verbose('Job %s requires staging of input data' % (job))

    # set all LFN to disk for the selected site
    stagingSite = destinationSites[0]
    siteDict = optInfo['SiteCandidates'][stagingSite]
    onDisk = siteDict['disk'] + siteDict['tape']
    siteDict['disk'] = onDisk
    siteDict['tape'] = 0
    optInfo['SiteCandidates'][stagingSite] = siteDict

    self.log.verbose(
        'Updating %s Optimizer Info for Job %s:' % (self.dataAgentName, job), optInfo)
    storeResult = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
    if not storeResult['OK']:
        return storeResult

    # Site is selected for staging, report it
    self.log.verbose('Staging site candidate for job %s is %s' % (job, stagingSite))

    sitesResult = self.__getStagingSites(stagingSite, destinationSites)
    stagingSites = sitesResult['Value'] if sitesResult['OK'] else [stagingSite]

    if len(stagingSites) == 1:
        siteAttribute = stagingSite
    else:
        # Get the name of the site group; 'Multiple' is used when the lookup
        # fails or yields no name
        siteAttribute = 'Multiple'
        groupResult = self.__getSiteGroup(stagingSites)
        if groupResult['OK'] and groupResult['Value']:
            siteAttribute = groupResult['Value']
    self.jobDB.setJobAttribute(job, 'Site', siteAttribute)

    stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
    if not stagerDict['OK']:
        return stagerDict
    self.__updateOtherSites(job, stagingSite, stagerDict['Value'], optInfo)
    return S_OK()
def optimizeJob(self, jid, jobState):
    """Schedule one job: hold it, send it to the task queue or request staging.

    Steps, in order: honour the reschedule delay, resolve the user's site
    requirements, read usable / unusable sites for 'ComputingAccess' from
    SiteStatus, hold jobs whose requested sites cannot run them, and, for
    jobs with input data, pick the site candidates holding all the data and
    request staging when it is required.

    :param jid: job identifier (not used in this method's body)
    :param jobState: job state object; queried through getAttributes(),
                     getAttribute() and getInputData() and passed to helpers
    :return: the result of the helper that finally handles the job
             (__holdJob, __sendToTQ, _setJobSite), or an S_ERROR / the failed
             result of an intermediate call
    """
    # Reschedule delay
    attrResult = jobState.getAttributes(
        ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not attrResult['OK']:
        return attrResult
    attDict = attrResult['Value']

    try:
        reschedules = int(attDict['RescheduleCounter'])
    except ValueError:
        return S_ERROR("RescheduleCounter has to be an integer")

    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Counts past the end of the list keep using the last delay
        lastIndex = len(delays) - 1
        delay = delays[min(reschedules, lastIndex)]
        now = toEpoch()
        rescheduledAt = toEpoch(fromString(attDict['RescheduleTime']))
        if now - rescheduledAt < delay:
            return self.__holdJob(jobState,
                                  'On Hold: after rescheduling %s' % reschedules,
                                  delay)

    # Get site requirements
    sitesResult = self._getSitesRequired(jobState)
    if not sitesResult['OK']:
        return sitesResult
    userSites, userBannedSites = sitesResult['Value']

    # Get active and banned sites from DIRAC
    rss = SiteStatus()
    usableResult = rss.getUsableSites('ComputingAccess')
    if not usableResult['OK']:
        return S_ERROR("Cannot retrieve active sites from JobDB")
    usableSites = usableResult['Value']
    unusableResult = rss.getUnusableSites('ComputingAccess')
    if not unusableResult['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    unusableSites = unusableResult['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        typeResult = jobState.getAttribute("JobType")
        if not typeResult['OK']:
            return S_ERROR("Could not retrieve job type")
        excludedTypes = self.ex_getOption('ExcludedOnHoldJobTypes', [])
        if (typeResult['Value'] not in excludedTypes
                and not self._applySiteFilter(userSites, usableSites, unusableSites)):
            return self.__holdJob(
                jobState, "Sites %s are inactive or banned" % ", ".join(userSites))

    # Get the Input data
    inputResult = jobState.getInputData()
    if not inputResult['OK']:
        self.jobLog.error("Cannot get input data %s" % (inputResult['Message']))
        return S_ERROR('Failed to get input data from JobDB')

    inputData = inputResult['Value']
    if not inputData:
        # No input data? Generate requirements and next
        return self.__sendToTQ(jobState, userSites, userBannedSites)

    self.jobLog.verbose('Has an input data requirement')

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    paramResult = self.retrieveOptimizerParam(idAgent)
    if not paramResult['OK']:
        self.jobLog.error("Could not retrieve input data info: %s" % paramResult['Message'])
        return S_ERROR("File Catalog Access Failure")
    opData = paramResult['Value']
    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)
    siteCandidates = self._applySiteFilter(siteCandidates, userSites, userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = dict((site, opData['SiteCandidates'][site]) for site in siteCandidates)

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    incompleteSites = []
    for site in idSites:
        replicas = idSites[site]
        if numData != replicas['disk'] + replicas['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            incompleteSites.append(site)
    # Removal happens after the scan so the dict is not changed while iterating
    for site in incompleteSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(jobState, inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, usableSites, unusableSites)
    if not stageSites:
        return self.__holdJob(
            jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False) and not self.__checkStageAllowed(jobState):
        return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))

    # Set as if everything has already been staged
    stageData = idSites[stageSite]
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stagingResult = self.__requestStaging(jobState, stageSite, opData)
    if not stagingResult['OK']:
        return stagingResult
    stageLFNs = stagingResult['Value']
    self._updateSharedSESites(stageSite, stageLFNs, opData)

    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    storeResult = self.storeOptimizerParam(idAgent, opData)
    if not storeResult['OK']:
        return storeResult

    return self._setJobSite(jobState, stageSites)
def setUp(self):
    """Create the clients used by the tests.

    ``rsClient`` is a ResourceStatusClient and ``stClient`` a SiteStatus
    client whose ``rssFlag`` is forced to True.
    """
    self.rsClient = ResourceStatusClient()

    siteStatusClient = SiteStatus()
    self.stClient = siteStatusClient
    siteStatusClient.rssFlag = True
def _resolveCECandidates(self, taskQueueDict):
    """
    Work out which CEs the pilots for this TaskQueue may be sent to.

    :param taskQueueDict: TaskQueue description; the keys read here are
                          'TaskQueueID' and, when present, 'GridCEs',
                          'BannedSites' and 'Sites'
    :return: the 'GridCEs' value itself when the TaskQueue names its CEs,
             otherwise the list of CE hosts found for the usable sites; an
             empty list when the site mask or the CE lookup fails or leaves
             no candidate site
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    requestedCEs = taskQueueDict.get('GridCEs')
    if requestedCEs:
        self.log.info(
            'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
            ', '.join(requestedCEs))
        return requestedCEs

    # Get the mask
    maskResult = SiteStatus().getUsableSites('ComputingAccess')
    if not maskResult['OK']:
        self.log.error('Can not retrieve site Mask from DB:', maskResult['Message'])
        return []

    usableSites = maskResult['Value']
    if not usableSites:
        self.log.error('Site mask is empty')
        return []

    self.log.verbose('Site Mask: %s' % ', '.join(usableSites))

    # remove banned sites from siteMask (the list is modified in place)
    for bannedSite in taskQueueDict.get('BannedSites', []):
        if bannedSite in usableSites:
            usableSites.remove(bannedSite)
            self.log.verbose('Removing banned site %s from site Mask' % bannedSite)

    # remove from the mask if a Site is given
    if 'Sites' in taskQueueDict:
        siteMask = [name for name in usableSites if name in taskQueueDict['Sites']]
    else:
        siteMask = list(usableSites)

    if not siteMask:
        # pilot can not be submitted
        self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' %
                      taskQueueDict['TaskQueueID'])
        return []

    self.log.info(
        'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
        ', '.join(siteMask))

    # Get CE's associates to the given site Names
    resources = Resources(vo=self.virtualOrganization)
    ceSelection = {
        'Site': siteMask,
        'SubmissionMode': 'gLite',
        'CEType': ['LCG', 'CREAM']
    }
    eligible = resources.getEligibleResources('Computing', ceSelection)
    if not eligible['OK']:
        self.log.error("Failed to get eligible ce's:", eligible['Message'])
        return []

    # CEs whose 'Host' value comes back as the 'unknown' default are skipped
    hosts = (resources.getComputingElementValue(ce, 'Host', 'unknown')
             for ce in eligible['Value'])
    ceMask = [host for host in hosts if host != 'unknown']

    if not ceMask:
        self.log.info(
            'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
            ', '.join(siteMask))

    self.log.verbose(
        'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
        ', '.join(ceMask))
    return ceMask