class TeamInfo(RESTEntity):
    """
    REST entity backed by a WMStats reader.

    This class need to move under WMStats server when wmstats server created
    """

    def __init__(self, app, api, config, mount):
        """
        Initialise the REST entity and open a reader on the WMStats database,
        whose URL is built from the couch host and database name in the config.
        """
        RESTEntity.__init__(self, app, api, config, mount)
        # main CouchDB database where requests/workloads are stored
        couchHost = self.config.couch_host
        dbName = self.config.couch_wmstats_db
        self.wmstats = WMStatsReader("%s/%s" % (couchHost, dbName))

    def validate(self, apiobj, method, api, param, safe):
        """
        Move a single positional argument from `param` to `safe`.

        Nothing is moved unless exactly one positional argument was given.
        """
        if len(param.args) != 1:
            return
        safe.args.append(param.args[0])
        param.args.pop()

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self):
        """
        Return the keys of the agentsByTeam result (presumably the team
        names - confirm against WMStatsReader), wrapped by `rows`.
        """
        teamAgents = self.wmstats.agentsByTeam(filterDrain=False)
        teamNames = teamAgents.keys()
        return rows(teamNames)
class TeamInfo(RESTEntity):
    """
    REST entity backed by a WMStats reader.

    This class need to move under WMStats server when wmstats server created
    """

    def __init__(self, app, api, config, mount):
        """
        Set up the base REST entity, then connect a WMStatsReader to the
        database located at <couch_host>/<couch_wmstats_db>.
        """
        RESTEntity.__init__(self, app, api, config, mount)
        # main CouchDB database where requests/workloads are stored
        urlParts = (self.config.couch_host, self.config.couch_wmstats_db)
        self.wmstats = WMStatsReader("%s/%s" % urlParts)

    def validate(self, apiobj, method, api, param, safe):
        """
        Accept at most one positional argument: when exactly one is present
        it is copied to `safe.args` and removed from `param.args`.
        """
        hasSingleArg = (len(param.args) == 1)
        if not hasSingleArg:
            return
        safe.args.append(param.args[0])
        param.args.pop()

    @restcall(formats=[('application/json', JSONFormat())])
    @tools.expires(secs=-1)
    def get(self):
        """
        Return the agentsByTeam result (drain filtering disabled), wrapped
        by `rows`.
        """
        agentsPerTeam = self.wmstats.agentsByTeam(filterDrain=False)
        return rows(agentsPerTeam)
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Initialize

        Caches the AgentStatusWatcher settings found in `config` and creates
        the SSB dashboard client, the resource control interface and the
        central WMStats reader used by the polling cycle.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types that share the CPU-bound / IO-bound thresholds
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        # lower bounds applied to the computed slots
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between
        resource control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception raised during the cycle is logged and swallowed so that
        the next cycle can retry.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            # first, update site status
            ssbSiteStatus = self.getSiteStatus()
            self.checkStatusChanges(sitesRC, ssbSiteStatus)

            # now fetch site slots thresholds
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Info from SSB: %s", sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view for agents and teams and update
        self.agentsNumByTeam with the value found for this agent's team.
        The current value is kept when the view is unavailable or the team
        is not listed.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s",
                            self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d",
                          self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get CPU bound and IO bound slots from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)

        return self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Note that forced-down sites are removed from `infoSSB` in place.
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            # drop it so the SSB state below cannot override the forced state
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then updates the task level too.
        The adjusted slot values are written back into `infoSSB`.
        """
        logging.debug("Settings for site and task pending slots: %s%% and %s%%",
                      self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0. Multiply before dividing and
                # truncate to an integer: 'slots *= cores / 100' evaluates
                # cores / 100 first, which is 0 under integer division and
                # produces float slot counts under true division.
                infoSSB[site]['slotsCPU'] = int(infoSSB[site]['slotsCPU'] * self.t1SitesCores / 100)
                infoSSB[site]['slotsIO'] = int(infoSSB[site]['slotsIO'] * self.t1SitesCores / 100)
            else:
                # round very small sites to the bare minimum
                infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'], self.minCPUSlots)
                infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'], self.minIOSlots)
            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']

            sitePending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsSitePercent / 100),
                              self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d",
                             site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site,
                                                        pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped.

        Both arguments are iterables of dicts carrying 'VOName' and 'Value'.
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid CPU thresholds in SSB. Taking no action',
                                entry['VOName'])
            else:
                ssbSiteSlots[entry['VOName']] = {'slotsCPU': int(entry['Value'])}

        # then iterate over the IO slots
        for entry in infoIo:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid IO thresholds in SSB. Taking no action',
                                entry['VOName'])
            else:
                # the site may have had no valid CPU entry; create its dict on
                # demand instead of failing with KeyError (it is dropped below)
                ssbSiteSlots.setdefault(entry['VOName'], {})['slotsIO'] = int(entry['Value'])

        # Before proceeding, remove sites without both metrics.
        # Iterate over a snapshot of the keys since entries are popped inside the loop.
        for site in list(ssbSiteSlots):
            if len(ssbSiteSlots[site]) != 2:
                logging.warning("Site: %s has incomplete SSB metrics, see %s",
                                site, ssbSiteSlots[site])
                ssbSiteSlots.pop(site)

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state.

        Returns a dict keyed by site name with values {'state': <agent state>};
        sites with an unknown status are skipped and only the first entry of a
        duplicated site is used.
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteState:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unknown status '%s' for site %s, please check SSB",
                                  status, voname)
                else:
                    ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.warning('I have a duplicated status entry in SSB for %s', voname)
        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that has no translation.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else "Draining"}

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the CPU and IOBound slots for a given site.

        Only the first task threshold returned by resource control is compared
        with the newly computed pending slots to decide whether the CPU and IO
        task thresholds are rewritten. In tier0 mode the Express and Repack
        thresholds are written as well.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)
        taskCPUPending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                             self.minCPUSlots)
        taskIOPending = max(int(IOBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                            self.minIOSlots)

        updateTasks = False
        if siteTaskSlots[0]['task_type'] in self.tasksCPU and \
                siteTaskSlots[0]['task_pending_slots'] != taskCPUPending:
            updateTasks = True
        elif siteTaskSlots[0]['task_type'] in self.tasksIO and \
                siteTaskSlots[0]['task_pending_slots'] != taskIOPending:
            updateTasks = True

        if updateTasks:
            logging.info("Updating %s CPU tasks thresholds for pend/runn: %d/%d",
                         siteName, taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksCPU,
                                                 maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info("Updating %s IO tasks thresholds for pend/runn: %d/%d",
                         siteName, taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksIO,
                                                 maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set task thresholds for Tier0
            logging.debug("Updating %s Express and Repack task thresholds.", siteName)
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Express', expressSlots, pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Repack', repackSlots, pendingRepack)
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Initialize

        Stores the configuration and loads the component settings from it.
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        self.setVariables(self.config)

    def setVariables(self, config):
        """
        load all the variables from the config file
        """
        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent teams (for dynamic threshold) and queueParams (drain mode)
        self.teamNames = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

    def setup(self, parameters):
        """
        Set db connection and prepare resource control
        """
        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between
        resource control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception raised during the cycle is logged and swallowed so that
        the next cycle can retry.
        """
        # set variables every polling cycle
        self.setVariables(self.config)
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                return
            logging.debug("Info from SSB: %s", sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams and set
        self.agentsNumByTeam to the smallest agent count among the teams this
        agent belongs to. The configured default is kept when the view is not
        available.
        """
        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam()
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.debug("agentInfo couch view is not available, use default value %s",
                          self.agentsNumByTeam)
        else:
            self.agentsByTeam = agentsByTeam
            # a team that is missing (or reported with 0 agents) counts as 1, so
            # the result can safely be used as a divisor for pending thresholds
            agentsCount = [self.agentsByTeam.get(team) or 1
                           for team in self.teamNames.split(',')]
            # If agent is in several teams, we choose the team with less agents.
            # (min(agentsCount, default) compared a list with an int and never
            # returned a per-team count.)
            self.agentsNumByTeam = min(agentsCount)
            logging.debug("Agents connected to the same team (not in DrainMode): %d",
                          self.agentsNumByTeam)
        return

    def _fetchSSBMetric(self, columnId):
        """
        Download one SSB metric column and return its parsed 'csvdata' content.
        """
        url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(columnId)
        response = urllib2.urlopen(url)
        try:
            # parse from json format to dictionary, get only 'csvdata'
            return json.loads(response.read())['csvdata']
        finally:
            # always release the HTTP connection
            response.close()

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        An empty dict is returned when any of the three metrics is empty.
        """
        # get info from dashboard
        site_state = self._fetchSSBMetric(self.siteStatusMetric)
        cpu_slots = self._fetchSSBMetric(self.cpuBoundMetric)
        io_slots = self._fetchSSBMetric(self.ioBoundMetric)

        # dictionaries with status/thresholds info by VOName
        stateBySite = self.siteStatusByVOName(site_state)
        slotsCPU = self.thresholdsByVOName(cpu_slots)
        slotsIO = self.thresholdsByVOName(io_slots)

        sitesSSB = {}
        if not stateBySite or not slotsCPU or not slotsIO:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
            return sitesSSB

        for siteName, state in stateBySite.items():
            # slots are None for a site that is missing from a threshold metric
            sitesSSB[siteName] = {'state': state,
                                  'slotsCPU': slotsCPU.get(siteName),
                                  'slotsIO': slotsIO.get(siteName)}
        return sitesSSB

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information)
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.
        The adjusted slot values are written back into `infoSSB`.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        logging.debug("Settings for site and task pending slots: %s%% and %s%%",
                      self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            # slots are None when SSB has a state but no threshold for the site;
            # count that as 0 so the site ends up with the bare minimum instead
            # of breaking the arithmetic below
            slotsCPU = infoSSB[site]['slotsCPU'] or 0
            slotsIO = infoSSB[site]['slotsIO'] or 0
            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0 (kept as an integer slot count)
                slotsCPU = int(slotsCPU * self.t1SitesCores / 100)
                slotsIO = int(slotsIO * self.t1SitesCores / 100)

            # round very small sites to the bare minimum
            CPUBound = max(slotsCPU, minCPUSlots)
            IOBound = max(slotsIO, minIOSlots)
            infoSSB[site]['slotsCPU'] = CPUBound
            infoSSB[site]['slotsIO'] = IOBound

            sitePending = max(int(CPUBound / agentsCount * self.pendingSlotsSitePercent / 100), minCPUSlots)
            taskCPUPending = max(int(CPUBound / agentsCount * self.pendingSlotsTaskPercent / 100), minCPUSlots)
            taskIOPending = max(int(IOBound / agentsCount * self.pendingSlotsTaskPercent / 100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.debug("Updating %s site thresholds for pend/runn: %d/%d",
                              site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site,
                                                        pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d",
                              site, taskCPUPending, CPUBound)
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(site, taskType=task,
                                                         maxSlots=CPUBound,
                                                         pendingSlots=taskCPUPending)

                # Update site IO tasks running and pending slots
                logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d",
                              site, taskIOPending, IOBound)
                for task in tasksIO:
                    self.resourceControl.insertThreshold(site, taskType=task,
                                                         maxSlots=IOBound,
                                                         pendingSlots=taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0
                logging.debug("Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(CPUBound * self.runningExpressPercent / 100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent / 100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)

    def thresholdsByVOName(self, sites):
        """
        _thresholdsByVOName_

        Creates a dictionary with keys->VOName and values->threshold.
        A missing (None) value is stored as 0; only the first entry of a
        duplicated site is used.
        """
        thresholdbyVOName = {}
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in thresholdbyVOName:
                logging.error('I have a duplicated threshold entry in SSB for %s', voname)
            elif value is None:
                logging.warning('Site %s does not have thresholds in SSB, assuming 0', voname)
                thresholdbyVOName[voname] = 0
            else:
                thresholdbyVOName[voname] = int(value)
        return thresholdbyVOName

    def siteStatusByVOName(self, sites):
        """
        _siteStatusByVOName_

        Creates a dictionary with keys->VOName and values->status.
        Sites without a status, or with a status that cannot be translated,
        are skipped; only the first entry of a duplicated site is used.
        """
        statusBySite = {}
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if not status:
                logging.error('Site %s does not have status in SSB', voname)
                continue
            if voname in statusBySite:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
                continue
            statusAgent = self.getState(str(status))
            if not statusAgent:
                logging.error("Unkwown status '%s' for site %s, please check SSB", status, voname)
                continue
            statusBySite[voname] = statusAgent
        return statusBySite

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that has no translation.
        """
        ssb2agent = {'on': 'Normal',
                     'drain': 'Draining',
                     'down': 'Down',
                     'skip': 'Down'}

        if stateSSB in ssb2agent:
            return ssb2agent[stateSSB]
        if stateSSB == "tier0":
            # 'tier0' sites are usable by tier0 agents only
            logging.debug('There is a site in tier0 status (Tier0Mode is %s)', self.tier0Mode)
            return "Normal" if self.tier0Mode else "Draining"
        return None

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Initialize

        Caches the AgentStatusWatcher settings found in `config` and creates
        the resource control interface and the central WMStats reader.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between
        resource control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception raised during the cycle is logged and swallowed so that
        the next cycle can retry.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                return
            logging.debug("Info from SSB: %s", sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams and update
        self.agentsNumByTeam with the value found for this agent's team.
        The current value is kept when the view is unavailable or the team
        is not listed.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s",
                            self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d",
                          self.agentsNumByTeam)
        return

    def _fetchSSBMetric(self, columnId):
        """
        Download one SSB metric column and return its parsed 'csvdata' content.
        """
        url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(columnId)
        response = urllib2.urlopen(url)
        try:
            # parse from json format to dictionary, get only 'csvdata'
            return json.loads(response.read())['csvdata']
        finally:
            # always release the HTTP connection
            response.close()

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name. Only
        sites carrying all three pieces of information (state, slotsCPU and
        slotsIO) are kept; the result is empty when no site qualifies.
        """
        # get info from dashboard
        ssbSiteState = self._fetchSSBMetric(self.siteStatusMetric)
        ssbCpuSlots = self._fetchSSBMetric(self.cpuBoundMetric)
        ssbIoSlots = self._fetchSSBMetric(self.ioBoundMetric)

        # dict updated by these methods with status/thresholds info keyed by the site name
        ssbSiteSlots = {}
        self.siteStatusByVOName(ssbSiteState, ssbSiteSlots)
        self.thresholdsByVOName(ssbCpuSlots, ssbSiteSlots, slotsType='slotsCPU')
        self.thresholdsByVOName(ssbIoSlots, ssbSiteSlots, slotsType='slotsIO')

        # Now remove sites with state only, such that no updates are applied to them
        ssbSiteSlots = {k: v for k, v in ssbSiteSlots.items() if len(v) == 3}

        if not ssbSiteSlots:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information)
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.
        The adjusted slot values are written back into `infoSSB`.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        logging.debug("Settings for site and task pending slots: %s%% and %s%%",
                      self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            if self.tier0Mode and 'T1_' in site:
                # T1 cores utilization for Tier0 (kept as an integer slot count)
                infoSSB[site]['slotsCPU'] = int(infoSSB[site]['slotsCPU'] * self.t1SitesCores / 100)
                infoSSB[site]['slotsIO'] = int(infoSSB[site]['slotsIO'] * self.t1SitesCores / 100)

            # round very small sites to the bare minimum
            infoSSB[site]['slotsCPU'] = max(infoSSB[site]['slotsCPU'], minCPUSlots)
            infoSSB[site]['slotsIO'] = max(infoSSB[site]['slotsIO'], minIOSlots)

            CPUBound = infoSSB[site]['slotsCPU']
            IOBound = infoSSB[site]['slotsIO']
            sitePending = max(int(CPUBound / agentsCount * self.pendingSlotsSitePercent / 100), minCPUSlots)
            taskCPUPending = max(int(CPUBound / agentsCount * self.pendingSlotsTaskPercent / 100), minCPUSlots)
            taskIOPending = max(int(IOBound / agentsCount * self.pendingSlotsTaskPercent / 100), minIOSlots)

            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d",
                             site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site,
                                                        pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

                # Update site CPU tasks running and pending slots (large running slots)
                logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d",
                              site, taskCPUPending, CPUBound)
                for task in tasksCPU:
                    self.resourceControl.insertThreshold(site, taskType=task,
                                                         maxSlots=CPUBound,
                                                         pendingSlots=taskCPUPending)

                # Update site IO tasks running and pending slots
                logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d",
                              site, taskIOPending, IOBound)
                for task in tasksIO:
                    self.resourceControl.insertThreshold(site, taskType=task,
                                                         maxSlots=IOBound,
                                                         pendingSlots=taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0
                logging.debug("Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(CPUBound * self.runningExpressPercent / 100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent / 100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)

    def thresholdsByVOName(self, sites, ssbSiteSlots, slotsType):
        """
        _thresholdsByVOName_

        Updates the dict with CPU and IO slots, only for sites with a valid state.

        `ssbSiteSlots` is modified in place: the value is stored under the
        `slotsType` key of the site, or the whole site is removed when its
        value is None.
        """
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in ssbSiteSlots:
                if value is None:
                    logging.warning('Site %s does not have thresholds in SSB. Taking no action', voname)
                    # then we better remove this site from our final dict
                    ssbSiteSlots.pop(voname)
                else:
                    ssbSiteSlots[voname][slotsType] = int(value)
            else:
                logging.warning('Found %s thresholds for site %s which has no state in SSB',
                                slotsType, voname)
        return

    def siteStatusByVOName(self, sites, ssbSiteSlots):
        """
        _siteStatusByVOName_

        Creates an inner dictionary for each site that will contain
        the site state and the number of slots.

        `ssbSiteSlots` is modified in place; sites whose status cannot be
        translated are skipped and only the first entry of a duplicated site
        is used.
        """
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if voname not in ssbSiteSlots:
                statusAgent = self.getState(str(status))
                if not statusAgent:
                    logging.error("Unkwown status '%s' for site %s, please check SSB", status, voname)
                else:
                    ssbSiteSlots[voname] = {'state': statusAgent}
            else:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
        return

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that has no translation.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else "Draining"}

        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Keep a reference to the agent configuration and load the
        component settings from it.
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.setVariables(self.config)

    def setVariables(self, config):
        """
        Load all the component settings from the configuration object.

        Called from __init__ and again at the start of every polling
        cycle in algorithm().
        """
        watcher = config.AgentStatusWatcher

        # dashboard url and metric columns
        self.dashboard = watcher.dashboard
        self.siteStatusMetric = watcher.siteStatusMetric
        self.cpuBoundMetric = watcher.cpuBoundMetric
        self.ioBoundMetric = watcher.ioBoundMetric

        # pending percentages
        self.pendingSlotsSitePercent = watcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = watcher.pendingSlotsTaskPercent
        self.runningExpressPercentCPUBound = watcher.runningExpressPercentCPUBound
        self.runningRepackPercentIOBound = watcher.runningRepackPercentIOBound

        # forced site list
        self.forcedSiteList = watcher.forcedSiteList

        # agent teams (for dynamic threshold) and queueParams (drain mode)
        self.teamNames = config.Agent.teamName
        self.queueParams = config.WorkQueueManager.queueParams

        # only SSB sites
        self.onlySSB = watcher.onlySSB

        # tier mode: enabled when the configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")

    def setup(self, parameters):
        """
        Create the resource control and WMStats handles used by algorithm().
        """
        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

        # last agents-by-team mapping successfully read from WMStats
        self.agentsByTeam = {}

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site info about state and thresholds
            1. Get information from SSB
            2. Get information about teams and agents from WMStats
            3. Set site status and set thresholds for each valid site
        Sites from SSB are validated with PhEDEx node names

        Any exception is logged and swallowed so that the next polling
        cycle can retry.
        """
        try:
            # set variables every polling cycle
            self.setVariables(self.config)

            # Get sites in Resource Control
            currentSites = self.resourceControl.listCurrentSites()

            logging.debug("Starting algorithm, getting site info from SSB")
            stateBySite, slotsCPU, slotsIO = self.getInfoFromSSB()

            if not stateBySite or not slotsCPU or not slotsIO:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Setting status and thresholds for all sites, site pending: %s%%, task pending: %s%%",
                          str(self.pendingSlotsSitePercent), str(self.pendingSlotsTaskPercent))

            if self.queueParams.get('DrainMode', False):
                agentsNum = 1
                logging.debug("This agent is in DrainMode, don't divide pending thresholds")
            else:
                # get number of agents working in the same team (not in DrainMode)
                agentsByTeam = self.centralCouchDBReader.agentsByTeam()
                if not agentsByTeam:
                    agentsNum = 1
                    logging.debug("agentInfo couch view is not available, don't divide pending thresholds")
                else:
                    self.agentsByTeam = agentsByTeam
                    # A team that is missing from the view, or that reports zero
                    # agents, counts as one agent so the divisor is never zero.
                    # If the agent is in several teams, the team with fewer
                    # agents is chosen.
                    agentsNum = min(self.agentsByTeam.get(team, 0) or 1
                                    for team in self.teamNames.split(','))
                    logging.debug("Number of agents not in DrainMode running in the same team: %s",
                                  str(agentsNum))

            # set site status and thresholds
            listSites = stateBySite.keys()
            if self.forcedSiteList:
                if set(self.forcedSiteList).issubset(set(listSites)):
                    logging.info("Forcing site list: %s", ', '.join(self.forcedSiteList))
                else:
                    logging.warning("Forcing site list: %s. Some site(s) are not in SSB",
                                    ', '.join(self.forcedSiteList))
                listSites = self.forcedSiteList

            for site in listSites:
                if site not in currentSites:
                    logging.debug("Site '%s' has not been added to Resource Control", site)
                    continue

                sitestate = stateBySite.get(site, 'Normal')
                # .get(): a forced site may be absent from the SSB slot metrics
                cpuSlots = slotsCPU.get(site)
                ioSlots = slotsIO.get(site)
                if not cpuSlots or not ioSlots:
                    if self.updateSiteInfo(site, sitestate, 0, 0, agentsNum):
                        logging.error('Setting site %s to %s, forcing CPUBound: 0, IOBound: 0 '
                                      'due to missing information in SSB', site, sitestate)
                    continue

                if self.updateSiteInfo(site, sitestate, cpuSlots, ioSlots, agentsNum):
                    logging.info('Setting site %s to %s, CPUBound: %s, IOBound: %s',
                                 site, sitestate, cpuSlots, ioSlots)

            # if onlySSB sites or forcedSiteList, force to down all the sites not in SSB/forcedSiteList
            if self.onlySSB or self.forcedSiteList:
                for site in set(currentSites).difference(set(listSites)):
                    if self.updateSiteInfo(site, 'Down', 0, 0):
                        logging.info('Only SSBsites/forcedSiteList, forcing site %s to Down, '
                                     'CPUBound: 0, IOBound: 0', site)

            logging.info("Resource update is completed, waiting for the next cycle.\n")

        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Read the component settings from the configuration and create the
        resource control and WMStats handles.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode: enabled when the configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between
        resource control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception is logged and swallowed so the next cycle can retry.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                return
            logging.debug("Info from SSB: %s", sitesSSB)

            # Check which site states need to be updated in the database
            sitesRC = self.checkStatusChanges(sitesRC, sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB, self.agentsNumByTeam)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams and store the number
        of agents of this agent's team in self.agentsNumByTeam. The
        previous value is kept when WMStats cannot be read.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s",
                            self.agentsNumByTeam)
        else:
            self.agentsNumByTeam = agentsByTeam.get(self.teamName, self.agentsNumByTeam)
            logging.debug("Agents connected to the same team (not in DrainMode): %d",
                          self.agentsNumByTeam)
        return

    def _fetchMetric(self, metric):
        """
        Download one SSB metric column and return its 'csvdata' payload.
        The HTTP response is always closed, even if reading or parsing fails.
        """
        url = self.dashboard + '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1' % str(metric)
        response = urllib2.urlopen(url)
        try:
            return json.loads(response.read())['csvdata']
        finally:
            response.close()

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        Only sites having state, CPU slots and IO slots are returned; an
        empty dict is returned (and an error logged) when none qualifies.
        """
        ssbSiteState = self._fetchMetric(self.siteStatusMetric)
        ssbCpuSlots = self._fetchMetric(self.cpuBoundMetric)
        ssbIoSlots = self._fetchMetric(self.ioBoundMetric)

        # dict updated by these methods with status/thresholds info keyed by the site name
        ssbSiteSlots = {}
        self.siteStatusByVOName(ssbSiteState, ssbSiteSlots)
        self.thresholdsByVOName(ssbCpuSlots, ssbSiteSlots, slotsType='slotsCPU')
        self.thresholdsByVOName(ssbIoSlots, ssbSiteSlots, slotsType='slotsIO')

        # Keep only complete entries (state + slotsCPU + slotsIO), such that
        # no updates are applied to sites with partial information
        ssbSiteSlots = dict((name, info) for name, info in ssbSiteSlots.items() if len(info) == 3)

        if not ssbSiteSlots:
            logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")

        return ssbSiteSlots

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Returns the new infoRC dict (where a few key/value pairs were
        deleted - no need to update slots information)
        """
        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoRC.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)):
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')
                infoRC.pop(site, None)

        # this time don't update infoRC since we still want to update slots info
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return infoRC

    def checkSlotsChanges(self, infoRC, infoSSB, agentsCount):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then also updates its tasks.
        Note that the slot values inside infoSSB are adjusted in place.
        """
        tasksCPU = ['Processing', 'Production']
        tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        minCPUSlots, minIOSlots = 50, 25

        # a team reporting no active agents must not cause a division by zero
        agentsCount = max(agentsCount, 1)

        logging.debug("Settings for site and task pending slots: %s%% and %s%%",
                      self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            siteSlots = infoSSB[site]
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0; int() keeps slot counts integral
                siteSlots['slotsCPU'] = int(siteSlots['slotsCPU'] * self.t1SitesCores / 100)
                siteSlots['slotsIO'] = int(siteSlots['slotsIO'] * self.t1SitesCores / 100)

            # round very small sites to the bare minimum
            siteSlots['slotsCPU'] = max(siteSlots['slotsCPU'], minCPUSlots)
            siteSlots['slotsIO'] = max(siteSlots['slotsIO'], minIOSlots)

            CPUBound = siteSlots['slotsCPU']
            IOBound = siteSlots['slotsIO']
            sitePending = max(int(CPUBound / agentsCount * self.pendingSlotsSitePercent / 100), minCPUSlots)
            taskCPUPending = max(int(CPUBound / agentsCount * self.pendingSlotsTaskPercent / 100), minCPUSlots)
            taskIOPending = max(int(IOBound / agentsCount * self.pendingSlotsTaskPercent / 100), minIOSlots)

            unchanged = (infoRC[site]['running_slots'] == CPUBound and
                         infoRC[site]['pending_slots'] == sitePending)
            if unchanged:
                continue

            # Update site running and pending slots
            logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
            self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                    runningJobSlots=CPUBound)

            # Update site CPU tasks running and pending slots (large running slots)
            logging.debug("Updating %s tasksCPU thresholds for pend/runn: %d/%d", site,
                          taskCPUPending, CPUBound)
            for task in tasksCPU:
                self.resourceControl.insertThreshold(site, taskType=task, maxSlots=CPUBound,
                                                     pendingSlots=taskCPUPending)

            # Update site IO tasks running and pending slots
            logging.debug("Updating %s tasksIO thresholds for pend/runn: %d/%d", site,
                          taskIOPending, IOBound)
            for task in tasksIO:
                self.resourceControl.insertThreshold(site, taskType=task, maxSlots=IOBound,
                                                     pendingSlots=taskIOPending)

            if self.tier0Mode:
                # Set task thresholds for Tier0
                logging.debug("Updating %s Express and Repack task thresholds.", site)
                expressSlots = int(CPUBound * self.runningExpressPercent / 100)
                pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Express', expressSlots, pendingExpress)

                repackSlots = int(CPUBound * self.runningRepackPercent / 100)
                pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
                self.resourceControl.insertThreshold(site, 'Repack', repackSlots, pendingRepack)

    def thresholdsByVOName(self, sites, ssbSiteSlots, slotsType):
        """
        _thresholdsByVOName_

        Updates the dict with CPU and IO slots, only for sites with a valid state.
        A site whose value is None is removed from ssbSiteSlots.
        """
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname not in ssbSiteSlots:
                logging.warning('Found %s thresholds for site %s which has no state in SSB',
                                slotsType, voname)
            elif value is None:
                logging.warning('Site %s does not have thresholds in SSB. Taking no action', voname)
                # then we better remove this site from our final dict
                ssbSiteSlots.pop(voname)
            else:
                ssbSiteSlots[voname][slotsType] = int(value)
        return

    def siteStatusByVOName(self, sites, ssbSiteSlots):
        """
        _siteStatusByVOName_

        Creates an inner dictionary for each site that will contain
        the site state and the number of slots. Sites with a status that
        getState() cannot translate are left out.
        """
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if voname in ssbSiteSlots:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
                continue
            statusAgent = self.getState(str(status))
            if statusAgent:
                ssbSiteSlots[voname] = {'state': statusAgent}
            else:
                logging.error("Unknown status '%s' for site %s, please check SSB", status, voname)
        return

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that has no translation.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else 'Draining'}
        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB
    """

    def __init__(self, config):
        """
        Read the component settings from the configuration and create the
        dashboard, resource control and WMStats handles.
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        # task types sharing the CPU-bound and the IO-bound thresholds
        self.tasksCPU = ['Processing', 'Production']
        self.tasksIO = ['Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim']
        # floor values applied to slot and pending-slot numbers
        self.minCPUSlots = 50
        self.minIOSlots = 25

        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric
        self.ssb = Dashboard(self.dashboard)

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # sites forced to down
        self.forceSiteDown = getattr(config.AgentStatusWatcher, 'forceSiteDown', [])

        # agent team (for dynamic threshold) and queueParams (drain mode)
        self.teamName = config.Agent.teamName
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode: enabled when the configuration has a Tier0Feeder section
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        _algorithm_

        Update site state and thresholds, based on differences between
        resource control database and info available in SSB.
            1. Get info from Resource Control database
            2. Get info from SSB
            3. Get information about teams and number of agents from WMStats
            4. Change site state when needed (this triggers a condor clasAd fetch)
            5. Change site thresholds when needed (and task thresholds)
        Sites from SSB are validated with PhEDEx node names

        Any exception is logged and swallowed so the next cycle can retry.
        """
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            sitesRC = self.resourceControl.listSitesSlots()
            logging.debug("Info from resource control: %s", sitesRC)
            # first, update site status
            ssbSiteStatus = self.getSiteStatus()
            self.checkStatusChanges(sitesRC, ssbSiteStatus)

            # now fetch site slots thresholds
            sitesSSB = self.getInfoFromSSB()
            if not sitesSSB:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Info from SSB: %s", sitesSSB)

            # get number of agents working in the same team (not in DrainMode)
            self.getAgentsByTeam()

            # Check which site slots need to be updated in the database
            self.checkSlotsChanges(sitesRC, sitesSSB)
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())
        logging.info("Resource control cycle finished updating site state and thresholds.")

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view for agents and teams and store the number of
        agents of this agent's team in self.agentsNumByTeam (never below 1,
        since it is used as a divisor). The previous value is kept when
        WMStats cannot be read.
        """
        if isDrainMode(self.config):
            # maximize pending thresholds to get this agent drained ASAP
            self.agentsNumByTeam = 1
            return

        agentsByTeam = {}
        try:
            agentsByTeam = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
        except Exception:
            logging.error("WMStats is not available or is unresponsive.")

        if not agentsByTeam:
            logging.warning("agentInfo couch view is not available, use default value %s",
                            self.agentsNumByTeam)
        else:
            # a team reporting zero agents counts as one to avoid dividing by zero
            self.agentsNumByTeam = max(agentsByTeam.get(self.teamName, self.agentsNumByTeam), 1)
            logging.debug("Agents connected to the same team (not in DrainMode): %d",
                          self.agentsNumByTeam)
        return

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get CPU bound and IO bound slots from dashboard (SSB).

        Returns a dict of dicts where the first key is the site name.
        """
        ssbCpuSlots = self.ssb.getMetric(self.cpuBoundMetric)
        ssbIoSlots = self.ssb.getMetric(self.ioBoundMetric)

        return self.thresholdsByVOName(ssbCpuSlots, ssbIoSlots)

    def checkStatusChanges(self, infoRC, infoSSB):
        """
        _checkStatusChanges_

        Checks which sites need to have their site state updated in
        resource control, based on:
          1. settings defined for the component (config.py)
          2. site state changes between SSB and RC

        Forced-down sites are removed from infoSSB in place.
        """
        forcedDown = set(self.forceSiteDown)

        # First sets list of forced sites to down (HLT @FNAL is an example)
        for site in self.forceSiteDown:
            if site in infoRC and infoRC[site]['state'] != 'Down':
                logging.info("Forcing site %s to Down", site)
                self.updateSiteState(site, 'Down')
            infoSSB.pop(site, None)

        # if onlySSB sites, force all the sites not in SSB to down; forced
        # sites were handled above and must not be updated a second time
        if self.onlySSB:
            for site in set(infoRC).difference(set(infoSSB)) - forcedDown:
                if infoRC[site]['state'] != 'Down':
                    logging.info('Only SSBsites, forcing site %s to Down', site)
                    self.updateSiteState(site, 'Down')

        # normally set all the others
        for site in set(infoRC).intersection(set(infoSSB)):
            if infoRC[site]['state'] != infoSSB[site]['state']:
                logging.info('Changing %s state from %s to %s', site,
                             infoRC[site]['state'], infoSSB[site]['state'])
                self.updateSiteState(site, infoSSB[site]['state'])
        return

    def checkSlotsChanges(self, infoRC, infoSSB):
        """
        _checkSlotsChanges_

        Checks which sites need to have their running and/or pending
        slots updated in resource control database, based on:
          1. number of agents connected to the same team
          2. and slots provided by the Dashboard team (SSB)

        If site slots are updated, then updates the task level too.
        Note that the slot values inside infoSSB are adjusted in place.
        """
        logging.debug("Settings for site and task pending slots: %s%% and %s%%",
                      self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

        for site in set(infoRC).intersection(set(infoSSB)):
            siteSlots = infoSSB[site]
            if self.tier0Mode and site.startswith('T1_'):
                # T1 cores utilization for Tier0. Multiply before dividing:
                # evaluating "t1SitesCores / 100" first truncates to 0 under
                # integer division and yields fractional slots under true
                # division; int() keeps the slot count integral either way.
                siteSlots['slotsCPU'] = int(siteSlots['slotsCPU'] * self.t1SitesCores / 100)
                siteSlots['slotsIO'] = int(siteSlots['slotsIO'] * self.t1SitesCores / 100)
            else:
                # round very small sites to the bare minimum
                siteSlots['slotsCPU'] = max(siteSlots['slotsCPU'], self.minCPUSlots)
                siteSlots['slotsIO'] = max(siteSlots['slotsIO'], self.minIOSlots)

            CPUBound = siteSlots['slotsCPU']
            IOBound = siteSlots['slotsIO']
            sitePending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsSitePercent / 100),
                              self.minCPUSlots)

            # update site slots, if needed
            if infoRC[site]['running_slots'] != CPUBound or infoRC[site]['pending_slots'] != sitePending:
                # Update site running and pending slots
                logging.info("Updating %s site thresholds for pend/runn: %d/%d", site, sitePending, CPUBound)
                self.resourceControl.setJobSlotsForSite(site, pendingJobSlots=sitePending,
                                                        runningJobSlots=CPUBound)

            # now handle the task level thresholds
            self.checkTaskSlotsChanges(site, CPUBound, IOBound)

    def thresholdsByVOName(self, infoCpu, infoIo):
        """
        _thresholdsByVOName_

        Creates a dictionary with CPU and IO slots keyed by the site name.
        If any of the thresholds is missing or has an invalid value, the whole
        site thresholds is skipped, so every returned entry carries both the
        'slotsCPU' and the 'slotsIO' keys.
        """
        ssbSiteSlots = {}
        for entry in infoCpu:
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', entry['VOName'])
                continue
            ssbSiteSlots[entry['VOName']] = {'slotsCPU': int(entry['Value'])}

        # then iterate over the IO slots
        for entry in infoIo:
            voname = entry['VOName']
            if voname not in ssbSiteSlots:
                logging.warning('Site %s does not have CPU thresholds in SSB. Taking no action', voname)
                continue
            if entry['Value'] is None:
                logging.warning('Site %s has invalid thresholds in SSB. Taking no action', voname)
                ssbSiteSlots.pop(voname, None)
                continue
            ssbSiteSlots[voname]['slotsIO'] = int(entry['Value'])

        # finally drop sites that had CPU slots but never got an IO entry
        cpuOnly = [name for name, slots in ssbSiteSlots.items() if 'slotsIO' not in slots]
        for voname in cpuOnly:
            logging.warning('Site %s does not have IO thresholds in SSB. Taking no action', voname)
            del ssbSiteSlots[voname]

        return ssbSiteSlots

    def getSiteStatus(self):
        """
        _getSiteStatus_

        Fetch site state from SSB and map it to agent state.
        Returns a dict keyed by site name with a {'state': ...} value; sites
        with a status that getState() cannot translate are left out.
        """
        ssbState = self.ssb.getMetric(self.siteStatusMetric)

        ssbSiteState = {}
        for site in ssbState:
            voname = site['VOName']
            status = site['Status']
            if voname in ssbSiteState:
                logging.warning('I have a duplicated status entry in SSB for %s', voname)
                continue
            statusAgent = self.getState(str(status))
            if statusAgent:
                ssbSiteState[voname] = {'state': statusAgent}
            else:
                logging.error("Unknown status '%s' for site %s, please check SSB", status, voname)

        return ssbSiteState

    def getState(self, stateSSB):
        """
        _getState_

        Translates SSB states into resource control state.
        Returns None for a state that has no translation.
        """
        ssb2agent = {'enabled': 'Normal',
                     'drain': 'Draining',
                     'disabled': 'Down',
                     # 'test' state behaviour varies between production and tier0 agents
                     'test': 'Normal' if self.tier0Mode else 'Draining'}
        return ssb2agent.get(stateSSB)

    def updateSiteState(self, siteName, state):
        """
        _updateSiteState_

        Update only the site state in the resource control database.
        Failures are logged and not propagated.
        """
        try:
            self.resourceControl.changeSiteState(siteName, state)
        except Exception as ex:
            logging.error("Failed to update %s state to %s:", siteName, state)
            logging.error(str(ex))
            logging.error("Traceback: \n%s", traceback.format_exc())
        return

    def checkTaskSlotsChanges(self, siteName, CPUBound, IOBound):
        """
        _checkTaskSlotsChanges_

        Update the CPU and IOBound slots for a given site.

        Only the first threshold row returned for the site is compared with
        the expected pending slots; a mismatch (or the absence of any row)
        triggers an update of all CPU and IO task types.
        """
        siteTaskSlots = self.resourceControl.thresholdBySite(siteName)
        taskCPUPending = max(int(CPUBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                             self.minCPUSlots)
        taskIOPending = max(int(IOBound / self.agentsNumByTeam * self.pendingSlotsTaskPercent / 100),
                            self.minIOSlots)

        if not siteTaskSlots:
            # no task thresholds defined yet for this site: create them
            updateTasks = True
        else:
            firstTask = siteTaskSlots[0]
            if firstTask['task_type'] in self.tasksCPU:
                updateTasks = firstTask['task_pending_slots'] != taskCPUPending
            elif firstTask['task_type'] in self.tasksIO:
                updateTasks = firstTask['task_pending_slots'] != taskIOPending
            else:
                updateTasks = False

        if updateTasks:
            logging.info("Updating %s CPU tasks thresholds for pend/runn: %d/%d", siteName,
                         taskCPUPending, CPUBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksCPU, maxSlots=CPUBound,
                                                 pendingSlots=taskCPUPending)
            logging.info("Updating %s IO tasks thresholds for pend/runn: %d/%d", siteName,
                         taskIOPending, IOBound)
            self.resourceControl.insertThreshold(siteName, taskType=self.tasksIO, maxSlots=IOBound,
                                                 pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set task thresholds for Tier0
            logging.debug("Updating %s Express and Repack task thresholds.", siteName)
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Express', expressSlots, pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName, 'Repack', repackSlots, pendingRepack)
class ResourceControlUpdater(BaseWorkerThread):
    """
    Update site status and thresholds from SSB

    On every polling cycle the component reads the site state, the CPU bound
    and the IO bound metrics from the dashboard (SSB), and writes the resulting
    site state and job slot thresholds into the resource control database.
    """
    # task types whose thresholds follow the CPU bound metric
    _CPU_TASKS = ('Processing', 'Production', 'Analysis')
    # task types whose thresholds follow the IO bound metric
    _IO_TASKS = ('Merge', 'Cleanup', 'Harvesting', 'LogCollect', 'Skim')
    # a non-zero task pending threshold is never set below this value
    _MIN_TASK_PENDING = 10
    # dashboard query (relative to the dashboard url) returning the last value of a metric
    _SSB_QUERY = '/request.py/getplotdata?columnid=%s&batch=1&lastdata=1'

    def __init__(self, config):
        """
        Initialize

        :param config: agent configuration object; kept as self.config and
                       used to load every setting through setVariables()
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.setVariables(self.config)

    def setVariables(self, config):
        """
        load all the variables from the config file

        :param config: agent configuration object providing the
                       AgentStatusWatcher, Agent and WorkQueueManager sections
        """
        # get dashboard url, set metric columns from config
        self.dashboard = config.AgentStatusWatcher.dashboard
        self.siteStatusMetric = config.AgentStatusWatcher.siteStatusMetric
        self.cpuBoundMetric = config.AgentStatusWatcher.cpuBoundMetric
        self.ioBoundMetric = config.AgentStatusWatcher.ioBoundMetric

        # set pending percentages from config
        self.pendingSlotsSitePercent = config.AgentStatusWatcher.pendingSlotsSitePercent
        self.pendingSlotsTaskPercent = config.AgentStatusWatcher.pendingSlotsTaskPercent
        self.runningExpressPercent = config.AgentStatusWatcher.runningExpressPercent
        self.runningRepackPercent = config.AgentStatusWatcher.runningRepackPercent

        # forced site list
        self.forcedSiteList = config.AgentStatusWatcher.forcedSiteList

        # agent teams (for dynamic threshold) and queueParams (drain mode)
        self.teamNames = config.Agent.teamName
        self.queueParams = config.WorkQueueManager.queueParams
        self.agentsNumByTeam = getattr(config.AgentStatusWatcher, 'defaultAgentsNumByTeam', 5)

        # only SSB sites
        self.onlySSB = config.AgentStatusWatcher.onlySSB

        # tier mode
        self.tier0Mode = hasattr(config, "Tier0Feeder")
        self.t1SitesCores = config.AgentStatusWatcher.t1SitesCores

        # switch this component on/off
        self.enabled = getattr(config.AgentStatusWatcher, 'enabled', True)

    def setup(self, parameters):
        """
        Set db connection and prepare resource control

        :param parameters: not used by this method
        """
        # set resource control
        self.resourceControl = ResourceControl(config=self.config)

        # wmstats connection
        self.centralCouchDBReader = WMStatsReader(self.config.AgentStatusWatcher.centralWMStatsURL)

        # last successfully retrieved team -> number of agents mapping
        self.agentsByTeam = {}

    def algorithm(self, parameters):
        """
        _algorithm_

        Update site info about state and thresholds
            1. Get information from SSB
            2. Get information about teams and agents from WMStats
            3. Set site status and set thresholds for each valid site
        Sites from SSB are validated with PhEDEx node names

        Any exception raised during the cycle is logged and swallowed, so
        the work is simply retried on the next polling cycle.

        :param parameters: not used by this method
        """
        # set variables every polling cycle
        self.setVariables(self.config)
        if not self.enabled:
            logging.info("This component is not enabled in the configuration. Doing nothing.")
            return

        try:
            # Get sites in Resource Control (set: cheap membership tests below)
            currentSites = set(self.resourceControl.listCurrentSites())

            logging.debug("Starting algorithm, getting site info from SSB")
            stateBySite, slotsCPU, slotsIO = self.getInfoFromSSB()

            if not stateBySite or not slotsCPU or not slotsIO:
                logging.error("One or more of the SSB metrics is down. Please contact the Dashboard team.")
                return

            logging.debug("Setting status and thresholds for all sites, site pending: %s%%, task pending: %s%%",
                          self.pendingSlotsSitePercent, self.pendingSlotsTaskPercent)

            # get number of agents working in the same team (not in DrainMode)
            agentsByTeam = self.getAgentsByTeam()
            if not agentsByTeam:
                # NOTE: setVariables() above has just reset agentsNumByTeam to
                # the configured default, so that is the value reported here
                logging.debug("agentInfo couch view is not available, use previous agent count %s",
                              self.agentsNumByTeam)
            else:
                self.agentsByTeam = agentsByTeam
                # a team missing from the view counts as a single agent
                agentsCount = [self.agentsByTeam.get(team, 1) for team in self.teamNames.split(',')]
                # If agent is in several teams, we choose the team with less agents
                self.agentsNumByTeam = min(agentsCount)
                logging.debug("Number of agents not in DrainMode running in the same team: %s",
                              self.agentsNumByTeam)

            # set site status and thresholds
            listSites = stateBySite.keys()
            if self.forcedSiteList:
                forcedSites = ', '.join(self.forcedSiteList)
                if set(self.forcedSiteList).issubset(listSites):
                    logging.info("Forcing site list: %s", forcedSites)
                else:
                    logging.warning("Forcing site list: %s. Some site(s) are not in SSB", forcedSites)
                # the forced list always wins, whether or not SSB knows every site
                listSites = self.forcedSiteList

            for site in listSites:
                if site not in currentSites:
                    logging.debug("Site '%s' has not been added to Resource Control", site)
                    continue

                sitestate = stateBySite.get(site, 'Normal')
                if site not in slotsCPU or site not in slotsIO:
                    logging.warning("%s not available in SSB. Changing only site status to %s.", site, sitestate)
                    self.updateSiteInfo(site, sitestate, None, None, self.agentsNumByTeam)
                    continue

                if self.updateSiteInfo(site, sitestate, slotsCPU[site], slotsIO[site], self.agentsNumByTeam):
                    logging.info('Setting site %s to %s, CPUBound: %s, IOBound: %s',
                                 site, sitestate, slotsCPU[site], slotsIO[site])

            # if onlySSB sites or forcedSiteList, force to down all the sites not in SSB/forcedSiteList
            if self.onlySSB or self.forcedSiteList:
                for site in currentSites.difference(listSites):
                    if self.updateSiteInfo(site, 'Down', 0, 0, self.agentsNumByTeam):
                        logging.info('Only SSBsites/forcedSiteList, forcing site %s to Down, CPUBound: 0, IOBound: 0',
                                     site)

            logging.info("Resource update is completed, waiting for the next cycle.\n")

        except Exception as ex:
            # top-level boundary of the polling cycle: log and retry next time
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())

    def getAgentsByTeam(self):
        """
        _getAgentsByTeam_

        Get the WMStats view about agents and teams

        :return: whatever the WMStats reader returns for agentsByTeam(), or
                 an empty dict when the call fails
        """
        try:
            return self.centralCouchDBReader.agentsByTeam()
        except Exception as ex:
            logging.error("WMStats is not available or is unresponsive. Don't divide thresholds by team")
            logging.error(str(ex))
            return {}

    def getInfoFromSSB(self):
        """
        _getInfoFromSSB_

        Get site status, CPU bound and IO bound from dashboard (SSB)

        :return: tuple (stateBySite, slotsCPU, slotsIO) of dictionaries
                 keyed by VOName
        """
        # get info from dashboard; only the 'csvdata' part of each reply is used
        site_state = self._fetchSSBMetric(self.siteStatusMetric)
        cpu_slots = self._fetchSSBMetric(self.cpuBoundMetric)
        io_slots = self._fetchSSBMetric(self.ioBoundMetric)

        # dictionaries with status/thresholds info by VOName
        stateBySite = self.siteStatusByVOName(site_state)
        slotsCPU = self.thresholdsByVOName(cpu_slots)
        slotsIO = self.thresholdsByVOName(io_slots)

        return stateBySite, slotsCPU, slotsIO

    def _fetchSSBMetric(self, metric):
        """
        Download one metric column from the dashboard and return the
        'csvdata' entry of its json reply.

        :param metric: dashboard column id
        :return: value stored under 'csvdata' in the decoded reply
        """
        response = urllib2.urlopen(self.dashboard + self._SSB_QUERY % str(metric))
        try:
            return json.loads(response.read())['csvdata']
        finally:
            # always release the connection, even if reading/parsing failed
            response.close()

    def thresholdsByVOName(self, sites):
        """
        _thresholdsByVOName_

        Creates a dictionary with keys->VOName and values->threshold:

        :param sites: iterable of entries providing 'VOName' and 'Value'
        :return: dict VOName -> int threshold; a missing (None) value is
                 stored as 0 and only the first entry per VOName is kept
        """
        thresholdbyVOName = {}
        for site in sites:
            voname = site['VOName']
            value = site['Value']
            if voname in thresholdbyVOName:
                logging.error('I have a duplicated threshold entry in SSB for %s', voname)
                continue
            if value is None:
                logging.warning('Site %s does not have threholds in SSB, assuming 0', voname)
                thresholdbyVOName[voname] = 0
            else:
                thresholdbyVOName[voname] = int(value)
        return thresholdbyVOName

    def siteStatusByVOName(self, sites):
        """
        _siteStatusByVOName_

        Creates a dictionary with keys->VOName and values->status:

        :param sites: iterable of entries providing 'VOName' and 'Status'
        :return: dict VOName -> resource control state; entries with an
                 empty or unknown status are logged and left out
        """
        statusBySite = {}
        for site in sites:
            voname = site['VOName']
            status = site['Status']
            if voname in statusBySite:
                logging.error('I have a duplicated status entry in SSB for %s', voname)
                continue
            if not status:
                logging.error('Site %s does not have status in SSB', voname)
                continue
            new_status = self.getState(str(status))
            if not new_status:
                logging.error("Unkwown status '%s' for site %s, please check SSB", str(status), voname)
                continue
            statusBySite[voname] = new_status
        return statusBySite

    def getState(self, stateFromSSB):
        """
        _getState_

        Translate SSB states into resource control state

        :param stateFromSSB: SSB state name
        :return: "Normal", "Draining" or "Down"; None for an unknown state
        """
        if stateFromSSB == "on":
            return "Normal"
        if stateFromSSB == "drain":
            return "Draining"
        if stateFromSSB == "tier0":
            # tier0 sites are usable only by agents running in Tier0 mode
            logging.debug('There is a site in tier0 status (Tier0Mode is %s)', self.tier0Mode)
            return "Normal" if self.tier0Mode else "Draining"
        if stateFromSSB in ("down", "skip"):
            return "Down"
        return None

    def updateSiteInfo(self, siteName, state, CPUBound, IOBound, agentsNum):
        """
        _updateSiteInfo_

        Update information about a site in the database. Also set thresholds for a given site

        pending_jobs policy:
            sitePending is CPUBound*(pendingSlotsSitePercent/100)
            taskPending is (CPUBound or IOBound)*(pendingSlotsTaskPercent/100) depending on the task type

        This allows to maintain the right pressure in the queue, and keep the agent safe.
        The site threshold is higher than each task threshold. This allows to have
        different task type jobs in the queue.

        When there are several agents in the same team, we divide the pending threshold
        between the number of agents running.

        :param siteName: name of the site to update
        :param state: resource control state to set for the site
        :param CPUBound: running slots for CPU bound tasks; None to change only the state
        :param IOBound: running slots for IO bound tasks; None to change only the state
        :param agentsNum: number of agents sharing the thresholds; values
                          below 1 are treated as 1
        :return: False when the site is unknown to resource control, True otherwise
        """
        if self.resourceControl.listSiteInfo(siteName) is None:
            logging.warning("Site %s has not been added to the resource control. "
                            "Please check if the site was added by the condor plugin", siteName)
            return False

        # set site state:
        self.resourceControl.changeSiteState(siteName, state)
        if CPUBound is None or IOBound is None:
            return True

        # never divide the thresholds by zero agents
        if not agentsNum or agentsNum < 1:
            agentsNum = 1

        # tier0 T1 cores utilization
        if self.tier0Mode and 'T1_' in siteName:
            CPUBound = CPUBound * self.t1SitesCores / 100
            IOBound = IOBound * self.t1SitesCores / 100

        # Thresholds:
        sitePending = int(CPUBound / agentsNum * self.pendingSlotsSitePercent / 100)
        taskCPUPending = int(CPUBound / agentsNum * self.pendingSlotsTaskPercent / 100)
        taskIOPending = int(IOBound / agentsNum * self.pendingSlotsTaskPercent / 100)

        # min pending values for thresholds (a threshold of 0 is left untouched)
        if 0 < taskCPUPending < self._MIN_TASK_PENDING:
            taskCPUPending = self._MIN_TASK_PENDING
        if 0 < taskIOPending < self._MIN_TASK_PENDING:
            taskIOPending = self._MIN_TASK_PENDING

        # Set site main thresholds
        self.resourceControl.setJobSlotsForSite(siteName=siteName,
                                                pendingJobSlots=sitePending,
                                                runningJobSlots=CPUBound)

        # Set thresholds for CPU bound task types
        for task in self._CPU_TASKS:
            self.resourceControl.insertThreshold(siteName=siteName, taskType=task,
                                                 maxSlots=CPUBound, pendingSlots=taskCPUPending)

        # Set thresholds for IO bound task types
        for task in self._IO_TASKS:
            self.resourceControl.insertThreshold(siteName=siteName, taskType=task,
                                                 maxSlots=IOBound, pendingSlots=taskIOPending)

        if self.tier0Mode:
            # Set thresholds for tier0 task types
            expressSlots = int(CPUBound * self.runningExpressPercent / 100)
            pendingExpress = int(expressSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName=siteName, taskType='Express',
                                                 maxSlots=expressSlots, pendingSlots=pendingExpress)

            repackSlots = int(CPUBound * self.runningRepackPercent / 100)
            pendingRepack = int(repackSlots * self.pendingSlotsTaskPercent / 100)
            self.resourceControl.insertThreshold(siteName=siteName, taskType='Repack',
                                                 maxSlots=repackSlots, pendingSlots=pendingRepack)

        return True