def checkReplicationStatus(self, continuous=True):
    """
    _checkReplicationStatus_

    Check if the workqueue replication is ok; if not, delete the replication
    documents so that new replications can be triggered when appropriate.
    It returns True if there is no error, and False otherwise.
    """
    if self.parentCouchUrl and self.queueUrl:  # only checks for local queue
        couchMonitor = CouchMonitor(self.server)
        filter = 'WorkQueue/queueFilter'
        query_params = {'childUrl': self.queueUrl,
                        'parentUrl': self.parentCouchUrl}

        self.logger.info("set replication from GQ to LQ")
        couchMonitor.recoverReplicationErrors(self.parentCouchUrl,
                                              "%s/%s" % (self.hostWithAuth, self.inbox.name),
                                              filter=filter, query_params=query_params,
                                              checkUpdateSeq=False, continuous=continuous)
        self.logger.info("set replication from LQ to GQ")
        couchMonitor.recoverReplicationErrors(self.inbox.name, self.parentCouchUrlWithAuth,
                                              filter=filter, query_params=query_params,
                                              checkUpdateSeq=False, continuous=continuous)
        return True
    return False
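A hedged usage sketch: the method lives on a queue-like object that must expose the attributes used above (parentCouchUrl, queueUrl, server, hostWithAuth, inbox, logger). The constructor arguments and URLs below are invented for illustration only.

import logging

# Hypothetical illustration: constructor signature and URLs are made up;
# checkReplicationStatus() only needs the attributes referenced above.
localQueue = WorkQueue(logger=logging.getLogger(),
                       QueueURL="https://agent.example:5984/workqueue",                # placeholder
                       ParentQueueCouchUrl="https://cmsweb.example/couchdb/workqueue")  # placeholder

if not localQueue.checkReplicationStatus(continuous=True):
    # False means this is not a local queue (no parent/queue URL pair configured)
    logging.warning("Replication check skipped: no parent queue configured")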
def setup(self, parameters):
    """
    Set up the db connections (couchdb, wmbs) needed to gather information.
    """
    # set the connection to local queue
    self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

    # set the connection for local couchDB call
    self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL,
                                         self.config.JobStateMachine.summaryStatsDBName,
                                         self.summaryLevel)

    # interface to WMBS/BossAir db
    myThread = threading.currentThread()
    # set wmagent db data
    self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

    # set the connection for local couchDB call
    self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                             "WMStatsAgent")

    if hasattr(self.config, "Tier0Feeder"):
        # use local db for tier0
        centralRequestCouchDBURL = self.config.AnalyticsDataCollector.localT0RequestDBURL
    else:
        centralRequestCouchDBURL = self.config.AnalyticsDataCollector.centralRequestDBURL

    self.centralRequestCouchDB = RequestDBWriter(centralRequestCouchDBURL,
                                                 couchapp=self.config.AnalyticsDataCollector.RequestCouchApp)
    # TODO: change the config to hold couch url
    self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)

    if self.pluginName is not None:
        pluginFactory = WMFactory("plugins", "WMComponent.AnalyticsDataCollector.Plugins")
        self.plugin = pluginFactory.loadObject(classname=self.pluginName)
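A minimal sketch of the configuration shape this setup() expects, assuming WMCore's Configuration API (config.section_ is the standard way to add a section). Every URL and database name below is an illustrative placeholder, not a real endpoint.

from WMCore.Configuration import Configuration

config = Configuration()
config.section_("AnalyticsDataCollector")
config.AnalyticsDataCollector.localQueueURL = "https://localhost:5984/workqueue"        # placeholder
config.AnalyticsDataCollector.localCouchURL = "https://localhost:5984"                  # placeholder
config.AnalyticsDataCollector.localWMStatsURL = "https://localhost:5984/wmstats"        # placeholder
config.AnalyticsDataCollector.centralRequestDBURL = "https://cmsweb.example/couchdb/reqmgr_workload_cache"  # placeholder
config.AnalyticsDataCollector.RequestCouchApp = "ReqMgr"
config.section_("JobStateMachine")
config.JobStateMachine.couchurl = "https://localhost:5984"                              # placeholder
config.JobStateMachine.summaryStatsDBName = "wmagent_summary"                           # placeholder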
def setup(self, parameters):
    """
    Set up the db connections (couchdb, wmbs) needed to gather information.
    """
    # interface to WMBS/BossAir db
    myThread = threading.currentThread()
    # set wmagent db data
    self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
    # set the connection for local couchDB call
    # self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL)
    self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
    self.localCouchServer = CouchMonitor(self.config.JobStateMachine.couchurl)
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    the local job couchdb and wmbs/bossair, and populate the summary db
    for monitoring.
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        proxyArgs = {'logger': logging.getLogger()}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        localWQUrl = config.AnalyticsDataCollector.localQueueURL
        self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                        'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                        'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # The first cycle needs to be skipped, since the document is not updated that fast.
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the db connections (couchdb, wmbs) needed to gather information.
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save the document locally as a json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)
        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from the local workqueue database
        :return: a dict with the workqueue statistics
        """
        results = {}

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(['Available', 'Acquired'])
        uniSites, posSites = getGlobalSiteStatusSummary(elements, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # Skip the check this round; set it to False so it runs next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is a data_error, a couch process maxed out
        # or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # upload the data directly to the remote to prevent data conflicts
        # when the agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir.
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])
        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s", results['completeRunJobByStatus'])
        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the proxy lifetime (usually X509_USER_PROXY) and raise either
        a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: same dictionary object plus additional keys/values if needed.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile, secsLeft)

        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
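For reference, each entry appended to self.replicatorDocs above corresponds to a filtered CouchDB replication. A hedged sketch of the non-T0 workqueue pair as a plain dict; every URL is a made-up placeholder and the exact document written by couchServer.replicate() may differ by CouchDB version.

# Illustrative only: placeholder URLs; the real values come from the agent config.
replicatorDoc = {
    "source": "https://cmsweb.example/couchdb/workqueue",         # global queue (parent)
    "target": "https://agent.example:5984/workqueue_inbox",       # local queue inbox
    "filter": "WorkQueue/queueFilter",                            # couchapp filter function
    "query_params": {"childUrl": "https://agent.example:5984/workqueue",
                     "parentUrl": "https://cmsweb.example/couchdb/workqueue"},
    "continuous": True,                                           # keep replicating as docs change
}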
class WorkQueueTestCase(EmulatedUnitTestCase):

    def setSchema(self):
        "this can be overridden if the schema setting is different"
        self.schema = ["WMCore.WMBS", "WMComponent.DBS3Buffer", "WMCore.BossAir"]
        self.couchApps = ["WorkQueue"]

    def setUp(self):
        """
        _setUp_

        Set up the database and logging connection. Try to create all of the
        WMBS tables. Also add some dummy locations.
        """
        super(WorkQueueTestCase, self).setUp()
        self.queueDB = 'workqueue_t'
        self.queueInboxDB = 'workqueue_t_inbox'
        self.globalQDB = 'workqueue_t_global'
        self.globalQInboxDB = 'workqueue_t_global_inbox'
        self.localQDB = 'workqueue_t_local'
        self.localQInboxDB = 'workqueue_t_local_inbox'
        self.localQDB2 = 'workqueue_t_local2'
        self.localQInboxDB2 = 'workqueue_t_local2_inbox'
        self.configCacheDB = 'workqueue_t_config_cache'
        self.logDBName = 'logdb_t'
        self.requestDBName = 'workqueue_t_reqmgr_workload_cache'
        self.setSchema()
        self.testInit = TestInit('WorkQueueTest')
        self.testInit.setLogging()
        self.testInit.setDatabaseConnection(destroyAllDatabase=True)
        self.addCleanup(self.testInit.clearDatabase)
        self.addCleanup(logging.debug, 'Cleanup called clearDatabase()')

        self.testInit.setSchema(customModules=self.schema, useDefault=False)
        self.testInit.setupCouch(self.queueDB, *self.couchApps)
        self.testInit.setupCouch(self.queueInboxDB, *self.couchApps)
        self.testInit.setupCouch(self.globalQDB, *self.couchApps)
        self.testInit.setupCouch(self.globalQInboxDB, *self.couchApps)
        self.testInit.setupCouch(self.localQDB, *self.couchApps)
        self.testInit.setupCouch(self.localQInboxDB, *self.couchApps)
        self.testInit.setupCouch(self.localQDB2, *self.couchApps)
        self.testInit.setupCouch(self.localQInboxDB2, *self.couchApps)
        self.testInit.setupCouch(self.configCacheDB, 'ConfigCache')
        self.testInit.setupCouch(self.logDBName, 'LogDB')
        self.testInit.setupCouch(self.requestDBName, 'ReqMgr')

        self.couchURL = os.environ.get("COUCHURL")
        couchServer = CouchServer(self.couchURL)
        self.configCacheDBInstance = couchServer.connectDatabase(self.configCacheDB)

        self.localCouchMonitor = CouchMonitor(self.couchURL)
        self.localCouchMonitor.deleteReplicatorDocs()
        self.addCleanup(self.localCouchMonitor.deleteReplicatorDocs)
        self.addCleanup(logging.debug, 'Cleanup called deleteReplicatorDocs()')
        self.addCleanup(self.testInit.tearDownCouch)
        self.addCleanup(logging.debug, 'Cleanup called tearDownCouch()')

        self.workDir = self.testInit.generateWorkDir()
        self.addCleanup(self.testInit.delWorkDir)
        self.addCleanup(logging.debug, 'Cleanup called delWorkDir()')
        return

    def tearDown(self):
        """
        _tearDown_

        Drop all the WMBS tables.
        """
        super(WorkQueueTestCase, self).tearDown()
        # Left here in case it's needed by any of the sub-classes
        return
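A minimal sketch of how a test suite could reuse this base class by overriding setSchema(), as its docstring suggests. The subclass name, schema list and test body are invented for illustration.

# Hypothetical subclass for illustration only.
class MyWorkQueueTest(WorkQueueTestCase):

    def setSchema(self):
        # narrower schema than the default set by the base class
        self.schema = ["WMCore.WMBS"]
        self.couchApps = ["WorkQueue"]

    def testSetupRan(self):
        # the base setUp() already created the couch databases and work dir
        self.assertTrue(self.workDir)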
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    the local job couchdb and wmbs/bossair, and populate the summary db
    for monitoring.
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile
        # counter for deep agent monitoring. Every 15min (3 cycles of the component)
        self.monitorCounter = 0
        self.monitorInterval = getattr(config.AgentStatusWatcher, 'monitorPollInterval', 3)

    def setUpCouchDBReplication(self):
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                        'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                        'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # The first cycle needs to be skipped, since the document is not updated that fast.
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the db connections (couchdb, wmbs) needed to gather information.
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            if self.monitorCounter % self.monitorInterval == 0:
                monitoring = self.collectWMBSInfo()
                monitoring['components'] = agentInfo['down_components']
                monitoring['timestamp'] = int(time.time())
                with open(self.jsonFile, 'w') as outFile:
                    json.dump(monitoring, outFile, indent=2)

            self.monitorCounter += 1
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())

    def collectCouchDBInfo(self):
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # Skip the check this round; set it to False so it runs next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes
        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning
        diskUseList = diskUse()
        diskUseThreshold = float(self.config.AnalyticsDataCollector.diskUseThreshold)
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            if float(disk['percent'].strip('%')) >= diskUseThreshold and \
                            disk['mounted'] not in self.config.AnalyticsDataCollector.ignoreDisk:
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is a data_error, a couch process maxed out
        # or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        if agentInfo['down_components']:
            logging.info("List of agent components down: %s" % agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # upload the data directly to the remote to prevent data conflicts
        # when the agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir.
        :return: dict with the number of jobs in each status
        """
        results = {}
        logging.info("Getting wmbs job info ...")

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.info("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.info("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.info("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.info("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])
        logging.info("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.info("Total number of complete jobs in BossAir sorted by status: %s", results['completeRunJobByStatus'])
        logging.info("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.info("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results
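A hedged sketch of a consumer for the monitoring file written by algorithm() above. The real path comes from config.AgentStatusWatcher.jsonFile; the one below is a made-up placeholder, and the one-hour staleness threshold is arbitrary.

import json
import time

# Placeholder path: the real location is config.AgentStatusWatcher.jsonFile.
with open("/data/srv/wmagent/agent_monitoring.json") as fd:
    monitoring = json.load(fd)

# 'timestamp' and 'components' are set by algorithm() before the json.dump()
ageSecs = time.time() - monitoring['timestamp']
if monitoring['components'] or ageSecs > 3600:
    print("Agent needs attention: down components %s, data age %ds"
          % (monitoring['components'], ageSecs))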
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    the local job couchdb and wmbs/bossair, and populate the summary db
    for monitoring.
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

        proxyArgs = {'logger': logging.getLogger(), 'cleanEnvironment': True}
        self.proxy = Proxy(proxyArgs)
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
        self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
        # credential lifetime warning/error thresholds, in days
        self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                               'certificate': {'error': 10, 'warning': 20}}

        # Monitoring setup
        self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None)
        self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None)
        self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False)
        self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None)
        self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

        # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
        if hasattr(self.config, "Tier0Feeder"):
            self.isT0agent = True
            self.producer = "tier0wmagent"
        else:
            self.isT0agent = False
            self.producer = "wmagent"
            localWQUrl = config.AnalyticsDataCollector.localQueueURL
            self.workqueueDS = WorkQueueDS(localWQUrl)

    def setUpCouchDBReplication(self):
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.General.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        if self.isT0agent:
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                        'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                        'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True)
        # The first cycle needs to be skipped, since the document is not updated that fast.
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the db connections (couchdb, wmbs) needed to gather information.
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkCredLifetime(agentInfo, "proxy")
            self.checkCredLifetime(agentInfo, "certificate")

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not self.isT0agent:
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            self.uploadAgentInfoToCentralWMStats(agentInfo)

            self.buildMonITDocs(agentInfo)
        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from the local workqueue database
        :return: a dict with the workqueue statistics
        """
        results = {}
        wqStates = ['Available', 'Acquired']

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(wqStates)
        uniSites, posSites = getGlobalSiteStatusSummary(elements, status=wqStates, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):
        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # Skip the check this round; set it to False so it runs next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # Change status if there is a data_error, a couch process maxed out
        # or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Add some required fields to the document before it can get uploaded
        to WMStats.
        :param agentInfo: dict with agent stats to be posted to couchdb
        """
        agentInfo['_id'] = agentInfo["agent_url"]
        agentInfo['timestamp'] = int(time.time())
        agentInfo['type'] = "agent_info"
        # directly upload to the remote to prevent data conflicts when the
        # agent is cleaned up and redeployed
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(agentInfo,
                                                       propertiesToKeep=["data_last_update", "data_error"])
        except Exception as e:
            logging.error("Failed to upload agent statistics to WMStats. Error: %s", str(e))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir.
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])
        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s", results['completeRunJobByStatus'])
        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results

    def checkCredLifetime(self, agInfo, credType):
        """
        Check the credential lifetime. Usually X509_USER_PROXY or X509_USER_CERT
        and raise either a warning or an error if the credential validity is
        about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: same dictionary object plus additional keys/values if needed.
        """
        if credType == "proxy":
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)
        elif credType == "certificate":
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)
        else:
            logging.error("Unknown credential type. Available options are: [proxy, certificate]")
            return

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft)

        daysLeft = secsLeft / (60 * 60 * 24)
        if daysLeft <= self.credThresholds[credType]['error']:
            credWarning = True
            agInfo['status'] = "error"
        elif daysLeft <= self.credThresholds[credType]['warning']:
            credWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            credWarning = False

        if credWarning:
            warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType, credFile)
            warnMsg += "Its time left is: %.2f hours;" % (secsLeft / 3600.)
            agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
            logging.warning(warnMsg)

        return

    def buildMonITDocs(self, dataStats):
        """
        Convert agent statistics into MonIT-friendly documents to be posted
        to AMQ/ES. It creates 5 different types of documents:
         * priority information
         * site information
         * work information
         * agent information
         * agent health information
        Note that the internal methods are popping some metrics out of dataStats
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        allDocs = self._buildMonITPrioDocs(dataStats)
        allDocs.extend(self._buildMonITSitesDocs(dataStats))
        allDocs.extend(self._buildMonITWorkDocs(dataStats))
        allDocs.extend(self._buildMonITWMBSDocs(dataStats))
        allDocs.extend(self._buildMonITAgentDocs(dataStats))
        allDocs.extend(self._buildMonITHealthDocs(dataStats))
        allDocs.extend(self._buildMonITSummaryDocs(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp'])

    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within
        that priority.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop('sitePendCountByPrio', [])

        for site, item in viewitems(sitePendCountByPrio):
            # it seems sites with no jobs are also always here as "Sitename": {0: 0}
            if list(item) == [0]:
                continue
            for prio, jobs in viewitems(item):
                prioDoc = {}
                prioDoc['site_name'] = site
                prioDoc['type'] = docType
                prioDoc['priority'] = prio
                prioDoc['job_count'] = jobs
                prioDocs.append(prioDoc)
        return prioDocs

    def _buildMonITSitesDocs(self, dataStats):
        """
        Uses the site thresholds and job information for each site in
        order to build a `site_info` document type for MonIT.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        siteDocs = []
        thresholds = dataStats['WMBS_INFO'].pop('thresholds', {})
        thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {})
        if self.isT0agent:
            possibleJobsPerSite = {}
            uniqueJobsPerSite = {}
        else:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop('possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop('uniqueJobsPerSite', {})

        for site in sorted(thresholds):
            siteDoc = {}
            siteDoc['site_name'] = site
            siteDoc['type'] = docType
            siteDoc['thresholds'] = thresholds[site]
            siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown')
            siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0)

            for status in possibleJobsPerSite:
                # make sure these keys are always present in the documents
                jobKey = "possible_%s_jobs" % status.lower()
                elemKey = "num_%s_elem" % status.lower()
                uniJobKey = "unique_%s_jobs" % status.lower()
                siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0
                if site in possibleJobsPerSite[status]:
                    siteDoc[jobKey] = possibleJobsPerSite[status][site]['sum_jobs']
                    siteDoc[elemKey] = possibleJobsPerSite[status][site]['num_elem']
                if site in uniqueJobsPerSite[status]:
                    siteDoc[uniJobKey] = uniqueJobsPerSite[status][site]['sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs

    def _buildMonITWorkDocs(self, dataStats):
        """
        Uses the local workqueue information ordered by WQE status and builds
        statistics for the workload in terms of workqueue elements and top
        level jobs. Using the WMBS data, also builds documents to show the
        amount of work in 'created' and 'executing' WMBS status.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        workDocs = []
        if self.isT0agent:
            return workDocs

        docType = "wma_work_info"
        workByStatus = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        for status, info in viewitems(workByStatus):
            workDoc = {}
            workDoc['type'] = docType
            workDoc['status'] = status
            workDoc['num_elem'] = info.get('num_elem', 0)
            workDoc['sum_jobs'] = info.get('sum_jobs', 0)
            workDocs.append(workDoc)

        return workDocs

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Using the WMBS data, builds documents to show the amount of work in
        'created' and 'executing' WMBS status. It also builds a document for
        every single wmbs_status in the database.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        docType = "wma_wmbs_info"
        wmbsDocs = []
        wmbsCreatedTypeCount = dataStats['WMBS_INFO'].pop('wmbsCreatedTypeCount', {})
        wmbsExecutingTypeCount = dataStats['WMBS_INFO'].pop('wmbsExecutingTypeCount', {})
        for jobType in wmbsCreatedTypeCount:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['job_type'] = jobType
            wmbsDoc['created_jobs'] = wmbsCreatedTypeCount[jobType]
            wmbsDoc['executing_jobs'] = wmbsExecutingTypeCount[jobType]
            wmbsDocs.append(wmbsDoc)

        docType = "wma_wmbs_state_info"
        wmbsCountByState = dataStats['WMBS_INFO'].pop('wmbsCountByState', {})
        for wmbsStatus in wmbsCountByState:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['wmbs_status'] = wmbsStatus
            wmbsDoc['num_jobs'] = wmbsCountByState[wmbsStatus]
            wmbsDocs.append(wmbsDoc)

        return wmbsDocs

    def _buildMonITAgentDocs(self, dataStats):
        """
        Uses the BossAir and WMBS table information in order to build a
        view of the amount of jobs in different statuses.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        docType = "wma_agent_info"
        agentDocs = []
        activeRunJobByStatus = dataStats['WMBS_INFO'].pop('activeRunJobByStatus', {})
        completeRunJobByStatus = dataStats['WMBS_INFO'].pop('completeRunJobByStatus', {})
        for schedStatus in activeRunJobByStatus:
            agentDoc = {}
            agentDoc['type'] = docType
            agentDoc['schedd_status'] = schedStatus
            agentDoc['active_jobs'] = activeRunJobByStatus[schedStatus]
            agentDoc['completed_jobs'] = completeRunJobByStatus[schedStatus]
            agentDocs.append(agentDoc)

        return agentDocs

    def _buildMonITHealthDocs(self, dataStats):
        """
        Creates documents with specific agent information, status of
        each component and worker thread (similar to what is shown in
        wmstats) and also some very basic performance numbers.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_health_info"
        healthDocs = []
        workersStatus = dataStats.pop('workers', {})
        for worker in workersStatus:
            healthDoc = {}
            healthDoc['type'] = docType
            healthDoc['worker_name'] = worker['name']
            healthDoc['worker_state'] = worker['state']
            healthDoc['worker_poll'] = worker['poll_interval']
            healthDoc['worker_last_hb'] = worker['last_updated']
            healthDoc['worker_cycle_time'] = worker['cycle_time']
            healthDocs.append(healthDoc)

        return healthDocs

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Creates a document with the very basic agent info used
        in the wmstats monitoring tab.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_summary_info MonIT docs
        """
        docType = "wma_summary_info"
        summaryDocs = []
        summaryDoc = {}
        summaryDoc['type'] = docType
        summaryDoc['agent_team'] = dataStats['agent_team']
        summaryDoc['agent_version'] = dataStats['agent_version']
        summaryDoc['agent_status'] = dataStats['status']
        if not self.isT0agent:
            summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO']['total_query_time']
        summaryDoc['wmbs_query_time'] = dataStats['WMBS_INFO']['total_query_time']
        summaryDoc['drain_mode'] = dataStats['drain_mode']
        summaryDoc['down_components'] = dataStats['down_components']
        summaryDocs.append(summaryDoc)
        return summaryDocs

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        :param docs: list of documents/dicts to be posted
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return
        # add mandatory information for every single document
        for doc in docs:
            doc['agent_url'] = agentUrl

        docType = "cms_%s_info" % self.producer
        notifications = []

        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=self.producer,
                                topic=self.topicAMQ,
                                validation_schema=None,
                                host_and_ports=self.hostPortAMQ,
                                logger=logging)

            for doc in docs:
                singleNotif, _, _ = stompSvc.make_notification(payload=doc, docType=docType,
                                                               ts=timeS, dataSubfield="payload")
                notifications.append(singleNotif)

            failures = stompSvc.send(notifications)
            msg = "%i out of %i documents successfully sent to AMQ" % (len(notifications) - len(failures),
                                                                       len(notifications))
            logging.info(msg)
        except Exception as ex:
            logging.exception("Failed to send data to StompAMQ. Error %s", str(ex))

        return
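To make the MonIT document shapes concrete, here is a self-contained sketch of what _buildMonITPrioDocs() produces from a sample sitePendCountByPrio metric. The logic mirrors the method above; the site names and numbers are invented.

# Invented input, mirroring the structure popped from dataStats['WMBS_INFO'].
sitePendCountByPrio = {
    "T2_XX_SiteA": {0: 0},                  # sites with no jobs show up as {0: 0} and are skipped
    "T2_XX_SiteB": {110000: 42, 85000: 7},  # priority -> pending job count
}

prioDocs = []
for site, item in sitePendCountByPrio.items():
    if list(item) == [0]:
        continue
    for prio, jobs in item.items():
        prioDocs.append({"type": "wma_prio_info", "site_name": site,
                         "priority": prio, "job_count": jobs})

# prioDocs now holds two wma_prio_info documents, both for T2_XX_SiteB.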
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    the local job couchdb and wmbs/bossair, and populate the summary db
    for monitoring.
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()

    def setUpCouchDBReplication(self):
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:
            # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'],
                                        'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'],
                                        'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(rp['source'], rp['target'],
                                                         filter=rp['filter'],
                                                         query_params=rp.get('query_params', False),
                                                         continuous=True, useReplicator=True)
        # The first cycle needs to be skipped, since the document is not updated that fast.
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the db connections (couchdb, wmbs) needed to gather information.
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        if hasattr(self.config, "Tier0Feeder"):
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                       appName="WMStatsAgent")
        else:
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            logging.info("Getting Agent info ...")
            agentInfo = self.collectAgentInfo()

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            logging.info("Agent components down:\n %s" % agentInfo['down_components'])
            logging.info("Agent in drain mode:\n %s \nsleep for next WMStats alarm updating cycle"
                         % agentInfo['drain_mode'])
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())

    def collectCouchDBInfo(self):
        couchInfo = {'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # Skip the check this round; set it to False so it runs next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                # propagate the error message from the failing replication check
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['status'] = "warning"
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append("CouchServer")
            agentInfo['status'] = couchInfo['status']
            couchInfo['name'] = "CouchServer"
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning
        diskUseList = diskUse()
        diskUseThreshold = float(self.config.AnalyticsDataCollector.diskUseThreshold)
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            if float(disk['percent'].strip('%')) >= diskUseThreshold and \
                            disk['mounted'] not in self.config.AnalyticsDataCollector.ignoreDisk:
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        couchProcessThreshold = float(self.config.AnalyticsDataCollector.couchProcessThreshold)
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo(self)
        if lastDataUpload['data_last_update'] != 0:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error'] != "":
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is a data_error, a couch process maxed out
        # or disk full problems.
        if agentInfo['status'] == 'ok':
            if agentInfo['disk_warning'] != []:
                agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if ('data_error' in agentInfo and agentInfo['data_error'] != 'ok') or \
               ('couch_process_warning' in agentInfo and agentInfo['couch_process_warning'] != 0):
                agentInfo['status'] = "error"

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        # upload the data directly to the remote to prevent data conflicts
        # when the agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)
class AgentStatusPoller(BaseWorkerThread): """ Gether the summary data for request (workflow) from local queue, local job couchdb, wmbs/boss air and populate summary db for monitoring """ def __init__(self, config): """ initialize properties specified from config """ BaseWorkerThread.__init__(self) # set the workqueue service for REST call self.config = config # need to get campaign, user, owner info self.agentInfo = initAgentInfo(self.config) self.summaryLevel = config.AnalyticsDataCollector.summaryLevel proxyArgs = {'logger': logging.getLogger()} self.proxy = Proxy(proxyArgs) self.proxyFile = self.proxy.getProxyFilename() # X509_USER_PROXY self.userCertFile = self.proxy.getUserCertFilename() # X509_USER_CERT # credential lifetime warning/error thresholds, in days self.credThresholds = {'proxy': {'error': 3, 'warning': 5}, 'certificate': {'error': 10, 'warning': 20}} # Monitoring setup self.userAMQ = getattr(config.AgentStatusWatcher, "userAMQ", None) self.passAMQ = getattr(config.AgentStatusWatcher, "passAMQ", None) self.postToAMQ = getattr(config.AgentStatusWatcher, "enableAMQ", False) self.topicAMQ = getattr(config.AgentStatusWatcher, "topicAMQ", None) self.hostPortAMQ = getattr(config.AgentStatusWatcher, "hostPortAMQ", [('cms-mb.cern.ch', 61313)]) # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here if hasattr(self.config, "Tier0Feeder"): self.isT0agent = True self.producer = "tier0wmagent" else: self.isT0agent = False self.producer = "wmagent" localWQUrl = config.AnalyticsDataCollector.localQueueURL self.workqueueDS = WorkQueueDS(localWQUrl) def setUpCouchDBReplication(self): self.replicatorDocs = [] # set up common replication code wmstatsSource = self.config.JobStateMachine.jobSummaryDBName wmstatsTarget = self.config.General.centralWMStatsURL self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget, 'filter': "WMStatsAgent/repfilter"}) if self.isT0agent: t0Source = self.config.Tier0Feeder.requestDBName t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL self.replicatorDocs.append({'source': t0Source, 'target': t0Target, 'filter': "T0Request/repfilter"}) else: # set up workqueue replication wqfilter = 'WorkQueue/queueFilter' parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"] childURL = self.config.WorkQueueManager.queueParams["QueueURL"] query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']} localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL, 'filter': wqfilter, 'query_params': query_params}) self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL, 'filter': wqfilter, 'query_params': query_params}) # delete old replicator docs before setting up self.localCouchMonitor.deleteReplicatorDocs() for rp in self.replicatorDocs: self.localCouchMonitor.couchServer.replicate( rp['source'], rp['target'], filter=rp['filter'], query_params=rp.get('query_params', False), continuous=True) # First cicle need to be skipped since document is not updated that fast self.skipReplicationCheck = True def setup(self, parameters): """ set db connection(couchdb, wmbs) to prepare to gather information """ # interface to WMBS/BossAir db myThread = threading.currentThread() # set wmagent db data self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger) self.centralWMStatsCouchDB = 
    def setup(self, parameters):
        """
        Set up the db connections (couchdb, wmbs) to prepare for gathering information
        """
        # interface to WMBS/BossAir db
        myThread = threading.currentThread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkCredLifetime(agentInfo, "proxy")
            self.checkCredLifetime(agentInfo, "certificate")

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not self.isT0agent:
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", timeSpent)

            self.uploadAgentInfoToCentralWMStats(agentInfo)
            self.buildMonITDocs(agentInfo)
        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from the local workqueue database
        :return: a dict with the local workqueue statistics
        """
        results = {}
        wqStates = ['Available', 'Acquired']

        results['workByStatus'] = self.workqueueDS.getJobsByStatus()
        results['workByStatusAndPriority'] = self.workqueueDS.getJobsByStatusAndPriority()

        elements = self.workqueueDS.getElementsByStatus(wqStates)
        uniSites, posSites = getGlobalSiteStatusSummary(elements, status=wqStates, dataLocality=True)
        results['uniqueJobsPerSite'] = uniSites
        results['possibleJobsPerSite'] = posSites

        return results

    def collectCouchDBInfo(self):

        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skip the check this round; reset the flag so the check runs next round
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'], rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo
    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, such as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes
        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)
        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # Change the status if there is a data_error, couch processes maxed out,
        # or disk full problems
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] in ('ok', 'warning'):
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Add some required fields to the document before it can
        be uploaded to WMStats.
        :param agentInfo: dict with agent stats to be posted to couchdb
        """
        agentInfo['_id'] = agentInfo["agent_url"]
        agentInfo['timestamp'] = int(time.time())
        agentInfo['type'] = "agent_info"
        # directly upload to the remote couch to prevent data conflicts when
        # the agent is cleaned up and redeployed
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(agentInfo,
                                                       propertiesToKeep=["data_last_update", "data_error"])
        except Exception as e:
            logging.error("Failed to upload agent statistics to WMStats. Error: %s", str(e))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir.
        :return: dict with the number of jobs in each status
        """
        logging.info("Getting wmbs job info ...")
        results = {}

        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        logging.debug("Total number of jobs in WMBS sorted by status: %s", results['wmbsCountByState'])
        logging.debug("Total number of 'created' jobs in WMBS sorted by type: %s", results['wmbsCreatedTypeCount'])
        logging.debug("Total number of 'executing' jobs in WMBS sorted by type: %s", results['wmbsExecutingTypeCount'])
        logging.debug("Total number of active jobs in BossAir sorted by status: %s", results['activeRunJobByStatus'])
        logging.debug("Total number of complete jobs in BossAir sorted by status: %s", results['completeRunJobByStatus'])
        logging.debug("Available slots thresholds to pull work from GQ to LQ: %s", results['thresholdsGQ2LQ'])
        logging.debug("List of jobs pending for each site, sorted by priority: %s", results['sitePendCountByPrio'])

        return results
    def checkCredLifetime(self, agInfo, credType):
        """
        Check the credential lifetime (usually X509_USER_PROXY or X509_USER_CERT)
        and raise either a warning or an error if its validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: same dictionary object plus additional keys/values if needed.
        """
        if credType == "proxy":
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)
        elif credType == "certificate":
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)
        else:
            logging.error("Unknown credential type. Available options are: [proxy, certificate]")
            return

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft)

        daysLeft = secsLeft / (60. * 60 * 24)
        if daysLeft <= self.credThresholds[credType]['error']:
            credWarning = True
            agInfo['status'] = "error"
        elif daysLeft <= self.credThresholds[credType]['warning']:
            credWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            credWarning = False

        if credWarning:
            warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType, credFile)
            warnMsg += "Its time left is: %.2f hours;" % (secsLeft / 3600.)
            agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
            logging.warning(warnMsg)

        return

    def buildMonITDocs(self, dataStats):
        """
        Convert agent statistics into MonIT-friendly documents to be posted
        to AMQ/ES. It creates the following types of documents:
         * priority information
         * site information
         * work information
         * wmbs information
         * agent information
         * agent health information
         * summary information
        Note that the internal methods are popping some metrics out of dataStats
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        allDocs = self._buildMonITPrioDocs(dataStats)
        allDocs.extend(self._buildMonITSitesDocs(dataStats))
        allDocs.extend(self._buildMonITWorkDocs(dataStats))
        allDocs.extend(self._buildMonITWMBSDocs(dataStats))
        allDocs.extend(self._buildMonITAgentDocs(dataStats))
        allDocs.extend(self._buildMonITHealthDocs(dataStats))
        allDocs.extend(self._buildMonITSummaryDocs(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp'])

    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within that
        priority.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop('sitePendCountByPrio', {})

        for site, item in sitePendCountByPrio.items():
            # it seems sites with no jobs are also always here, as "Sitename": {0: 0}
            if list(item) == [0]:
                continue
            for prio, jobs in item.items():
                prioDoc = {}
                prioDoc['site_name'] = site
                prioDoc['type'] = docType
                prioDoc['priority'] = prio
                prioDoc['job_count'] = jobs
                prioDocs.append(prioDoc)
        return prioDocs
    def _buildMonITSitesDocs(self, dataStats):
        """
        Uses the site thresholds and job information for each site in order
        to build a `site_info` document type for MonIT.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        siteDocs = []
        thresholds = dataStats['WMBS_INFO'].pop('thresholds', {})
        thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {})
        if self.isT0agent:
            possibleJobsPerSite = {}
            uniqueJobsPerSite = {}
        else:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop('possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop('uniqueJobsPerSite', {})

        for site in sorted(thresholds):
            siteDoc = {}
            siteDoc['site_name'] = site
            siteDoc['type'] = docType
            siteDoc['thresholds'] = thresholds[site]
            siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown')
            siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0)

            for status in possibleJobsPerSite:
                # make sure these keys are always present in the documents
                jobKey = "possible_%s_jobs" % status.lower()
                elemKey = "num_%s_elem" % status.lower()
                uniJobKey = "unique_%s_jobs" % status.lower()
                siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0
                if site in possibleJobsPerSite[status]:
                    siteDoc[jobKey] = possibleJobsPerSite[status][site]['sum_jobs']
                    siteDoc[elemKey] = possibleJobsPerSite[status][site]['num_elem']
                if site in uniqueJobsPerSite[status]:
                    siteDoc[uniJobKey] = uniqueJobsPerSite[status][site]['sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs

    def _buildMonITWorkDocs(self, dataStats):
        """
        Uses the local workqueue information, ordered by WQE status, to build
        statistics for the workload in terms of workqueue elements and top
        level jobs.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        workDocs = []
        if self.isT0agent:
            return workDocs

        docType = "wma_work_info"
        workByStatus = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        for status, info in workByStatus.items():
            workDoc = {}
            workDoc['type'] = docType
            workDoc['status'] = status
            workDoc['num_elem'] = info.get('num_elem', 0)
            workDoc['sum_jobs'] = info.get('sum_jobs', 0)
            workDocs.append(workDoc)

        return workDocs

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Using the WMBS data, builds documents to show the amount of work in
        'created' and 'executing' WMBS status. It also builds a document for
        every single wmbs_status in the database.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        docType = "wma_wmbs_info"
        wmbsDocs = []
        wmbsCreatedTypeCount = dataStats['WMBS_INFO'].pop('wmbsCreatedTypeCount', {})
        wmbsExecutingTypeCount = dataStats['WMBS_INFO'].pop('wmbsExecutingTypeCount', {})
        for jobType in wmbsCreatedTypeCount:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['job_type'] = jobType
            wmbsDoc['created_jobs'] = wmbsCreatedTypeCount[jobType]
            wmbsDoc['executing_jobs'] = wmbsExecutingTypeCount[jobType]
            wmbsDocs.append(wmbsDoc)

        docType = "wma_wmbs_state_info"
        wmbsCountByState = dataStats['WMBS_INFO'].pop('wmbsCountByState', {})
        for wmbsStatus in wmbsCountByState:
            wmbsDoc = {}
            wmbsDoc['type'] = docType
            wmbsDoc['wmbs_status'] = wmbsStatus
            wmbsDoc['num_jobs'] = wmbsCountByState[wmbsStatus]
            wmbsDocs.append(wmbsDoc)

        return wmbsDocs
    def _buildMonITAgentDocs(self, dataStats):
        """
        Uses the BossAir and WMBS table information in order to build a
        view of the amount of jobs in different statuses.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        docType = "wma_agent_info"
        agentDocs = []
        activeRunJobByStatus = dataStats['WMBS_INFO'].pop('activeRunJobByStatus', {})
        completeRunJobByStatus = dataStats['WMBS_INFO'].pop('completeRunJobByStatus', {})
        for schedStatus in activeRunJobByStatus:
            agentDoc = {}
            agentDoc['type'] = docType
            agentDoc['schedd_status'] = schedStatus
            agentDoc['active_jobs'] = activeRunJobByStatus[schedStatus]
            agentDoc['completed_jobs'] = completeRunJobByStatus[schedStatus]
            agentDocs.append(agentDoc)

        return agentDocs

    def _buildMonITHealthDocs(self, dataStats):
        """
        Creates documents with specific agent information, status of
        each component and worker thread (similar to what is shown in
        wmstats) and also some very basic performance numbers.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        docType = "wma_health_info"
        healthDocs = []
        workersStatus = dataStats.pop('workers', {})
        for worker in workersStatus:
            healthDoc = {}
            healthDoc['type'] = docType
            healthDoc['worker_name'] = worker['name']
            healthDoc['worker_state'] = worker['state']
            healthDoc['worker_poll'] = worker['poll_interval']
            healthDoc['worker_last_hb'] = worker['last_updated']
            healthDoc['worker_cycle_time'] = worker['cycle_time']
            healthDocs.append(healthDoc)
        return healthDocs

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Creates a document with the very basic agent info used
        in the wmstats monitoring tab.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_summary_info MonIT docs
        """
        docType = "wma_summary_info"
        summaryDocs = []
        summaryDoc = {}
        summaryDoc['type'] = docType
        summaryDoc['agent_team'] = dataStats['agent_team']
        summaryDoc['agent_version'] = dataStats['agent_version']
        summaryDoc['agent_status'] = dataStats['status']
        if not self.isT0agent:
            summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO']['total_query_time']
        summaryDoc['wmbs_query_time'] = dataStats['WMBS_INFO']['total_query_time']
        summaryDoc['drain_mode'] = dataStats['drain_mode']
        summaryDoc['down_components'] = dataStats['down_components']
        summaryDocs.append(summaryDoc)
        return summaryDocs

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        :param docs: list of documents/dicts to be posted
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return
        # add mandatory information for every single document
        for doc in docs:
            doc['agent_url'] = agentUrl

        docType = "cms_%s_info" % self.producer
        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=self.producer,
                                topic=self.topicAMQ,
                                host_and_ports=self.hostPortAMQ,
                                logger=logging)
            notifications = [stompSvc.make_notification(payload=doc, docType=docType, ts=timeS,
                                                        dataSubfield="payload") for doc in docs]
            failures = stompSvc.send(notifications)
            logging.info("%i docs successfully sent to AMQ", len(notifications) - len(failures))
        except Exception as ex:
            logging.exception("Failed to send data to StompAMQ. Error %s", str(ex))

        return
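# ---------------------------------------------------------------------------
# Illustrative sketch (not part of WMCore): the _buildMonIT*Docs() helpers all
# follow the same pattern of flattening one nested metric into a list of small,
# self-describing documents. The sample below runs _buildMonITPrioDocs-shaped
# input through the same transformation; the function name and the sample
# values are made up for the example.

def buildPrioDocs(sitePendCountByPrio):
    """Flatten a {site: {priority: jobs}} metric into wma_prio_info documents."""
    prioDocs = []
    for site, item in sitePendCountByPrio.items():
        if list(item) == [0]:  # sites with no jobs show up as {0: 0}
            continue
        for prio, jobs in item.items():
            prioDocs.append({'type': 'wma_prio_info', 'site_name': site,
                             'priority': prio, 'job_count': jobs})
    return prioDocs

# Usage: the empty site is dropped, the active site yields one doc per priority
sample = {'T1_US_FNAL': {100000: 12, 85000: 3}, 'T2_CH_CERN': {0: 0}}
flatDocs = buildPrioDocs(sample)
assert len(flatDocs) == 2 and all(d['site_name'] == 'T1_US_FNAL' for d in flatDocs)
# ---------------------------------------------------------------------------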
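# ---------------------------------------------------------------------------
# Another illustrative sketch (not part of WMCore): the credential check in
# AgentStatusPoller.checkCredLifetime() compares the days left on a credential
# against per-type thresholds. The pure function below reproduces only that
# decision; the threshold table mirrors the one defined in __init__, while the
# function and argument names are hypothetical.

CRED_THRESHOLDS = {'proxy': {'error': 3, 'warning': 5},
                   'certificate': {'error': 10, 'warning': 20}}

def credStatus(credType, secsLeft, thresholds=CRED_THRESHOLDS):
    """Map a credential's remaining lifetime to 'ok', 'warning' or 'error'."""
    daysLeft = secsLeft / (60. * 60 * 24)
    if daysLeft <= thresholds[credType]['error']:
        return "error"
    elif daysLeft <= thresholds[credType]['warning']:
        return "warning"
    return "ok"

# A proxy with 4 days left only warns; a certificate with 4 days left is
# already an error because its thresholds are larger
assert credStatus('proxy', 4 * 86400) == 'warning'
assert credStatus('certificate', 4 * 86400) == 'error'
# ---------------------------------------------------------------------------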