def run(self):
    """Check Jenkins plugin updates.

    Counts installed plugins and how many have an update available,
    raising WARNING if any do, and emits perfdata including query time.
    Raises CriticalError on any Jenkins API failure.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            whoami = server.get_whoami()
            log.debug('connected as user %s', jsonpp(whoami))
        log.debug('getting plugin info')
        # .get_plugins() output is not JSON serializable,
        # so must use the old deprecated method get_plugins_info() :-/
        plugins = server.get_plugins_info()
        query_time = time.time() - start_time
    except jenkins.JenkinsException as exc:
        raise CriticalError(exc)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(plugins))
    plugin_count = len(plugins)
    update_count = sum(1 for plugin in plugins if plugin['hasUpdate'])
    self.msg += " {0} plugin update{1} available out of {2} installed plugin{3}".format(
        update_count, plural(update_count), plugin_count, plural(plugin_count))
    if update_count:
        self.warning()
    self.msg += ' | updates_available={0};1 plugins_installed={1} query_time={2:.4f}s'.format(
        update_count, plugin_count, query_time)
def run(self):
    """Check the number of currently running Jenkins builds against thresholds.

    Emits the running build count plus query time as perfdata.
    Raises CriticalError on any Jenkins API failure.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            whoami = server.get_whoami()
            log.debug('connected as user %s', jsonpp(whoami))
        log.debug('fetching running builds')
        running_builds = server.get_running_builds()
        if log.isEnabledFor(logging.DEBUG):
            log.debug('%s', jsonpp(running_builds))
        running_build_count = len(running_builds)
        log.debug('running build count: %s', running_build_count)
        self.msg += '{0}'.format(running_build_count)
        self.check_thresholds(running_build_count)
    except jenkins.JenkinsException as exc:
        raise CriticalError(exc)
    query_time = time.time() - start_time
    self.msg += ' | running_build_count={0:d}{1} query_time={2:.4f}s'.format(
        running_build_count, self.get_perf_thresholds(), query_time)
def run(self):
    """Count Jenkins jobs (optionally scoped to a view) and check thresholds.

    With --list-views, prints all view names and exits UNKNOWN.
    Raises CriticalError on any Jenkins API failure.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            whoami = server.get_whoami()
            log.debug('connected as user %s', jsonpp(whoami))
        if self.list_views:
            log.debug('getting views')
            views = server.get_views()
            if log.isEnabledFor(logging.DEBUG):
                log.debug("%s", jsonpp(views))
            print('Jenkins views:\n')
            for view in views:
                print(view['name'])
            sys.exit(ERRORS['UNKNOWN'])
        if self.view:
            log.debug('checking view exists')
            server.assert_view_exists(self.view)
            log.debug('getting jobs for view %s', self.view)
            view_jobs = server.get_jobs(view_name=self.view)
            if log.isEnabledFor(logging.DEBUG):
                log.debug("%s", jsonpp(view_jobs))
            job_count = len(view_jobs)
        else:
            log.debug('getting job count')
            # jobs_count() is more efficient than enumerating all jobs when
            # there are many folders; a server-side groovy count via
            # run_script() would be another option
            job_count = server.jobs_count()
        query_time = time.time() - start_time
        log.debug('job count: %s', job_count)
        if self.view:
            self.msg += "for view '{0}' ".format(self.view)
        self.msg += '= {0}'.format(job_count)
        self.check_thresholds(job_count)
    except jenkins.JenkinsException as exc:
        raise CriticalError(exc)
    self.msg += ' | job_count={0:d}'.format(job_count)
    self.msg += self.get_perf_thresholds()
    self.msg += ' query_time={0:.4f}s'.format(query_time)
def check(self, client):
    """Check Docker container counts (running / paused / stopped / total).

    Exactly one of the self.running/paused/stopped/total flags selects a
    single count checked against thresholds; with none set, all four counts
    are reported without threshold checks. Emits perfdata either way.
    """
    log.info('running Docker info')
    info = client.info()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(info))
    total_count = info['Containers']
    running_count = info['ContainersRunning']
    paused_count = info['ContainersPaused']
    stopped_count = info['ContainersStopped']
    self.msg = 'Docker '
    # (label, perfdata key, value) for the selected mode, if any
    if self.running:
        selected = ('running containers', 'running_containers', running_count)
    elif self.paused:
        selected = ('paused containers', 'paused_containers', paused_count)
    elif self.stopped:
        selected = ('stopped containers', 'stopped_containers', stopped_count)
    elif self.total:
        selected = ('total containers', 'total_containers', total_count)
    else:
        selected = None
    if selected is not None:
        label, perf_key, value = selected
        self.msg += '{} = {}'.format(label, value)
        self.check_thresholds(value)
        self.msg += ' | {}={}{}'.format(perf_key, value, self.get_perf_thresholds())
    else:
        self.msg += 'containers = {}, running containers = {}, paused containers = {}, stopped containers = {}'\
                    .format(total_count, running_count, paused_count, stopped_count)
        self.msg += ' | containers={} running_containers={} paused_containers={} stopped_containers={}'\
                    .format(total_count, running_count, paused_count, stopped_count)
def run(self):
    """Count offline Jenkins nodes and compare against thresholds.

    Reports offline nodes out of the total node count, with perfdata.
    Raises CriticalError on any Jenkins API failure.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            whoami = server.get_whoami()
            log.debug('connected as user %s', jsonpp(whoami))
        log.debug('getting Jenkins nodes')
        nodes = server.get_nodes()
        log.debug('nodes: %s', nodes)
        node_count = len(nodes)
        log.debug('node count: %s', node_count)
        offline_nodes = sum(1 for node in nodes if node['offline'])
        self.msg += '{0} offline node{1}'.format(offline_nodes, plural(offline_nodes))
        self.check_thresholds(offline_nodes)
        self.msg += ' out of {0} node{1}'.format(node_count, plural(node_count))
    except jenkins.JenkinsException as exc:
        raise CriticalError(exc)
    query_time = time.time() - start_time
    self.msg += ' | offline_nodes={0:d}'.format(offline_nodes)
    self.msg += self.get_perf_thresholds()
    self.msg += ' node_count={0:d}'.format(node_count)
    self.msg += ' query_time={0:.4f}s'.format(query_time)
def req(self, url, method='post', body=None):
    """Issue an HTTP request with JSON headers and session cookie handling.

    Sends *body* via the given requests method, refreshes self.jsessionid
    from the response cookies, and qquits CRITICAL on connection errors or
    non-200 responses (including the server's 'result' field if parseable).
    Returns a (response, seconds_taken) tuple.
    """
    assert isStr(method)
    log.debug('%s %s', method.upper(), url)
    headers = {"Content-Type": "application/json",
               "Accept": "application/json",
               "JSESSIONID": self.jsessionid}
    log.debug('headers: %s', headers)
    start_time = time.time()
    try:
        req = getattr(requests, method.lower())(url,
                                                #cookies=self.jar,
                                                data=body,
                                                headers=headers)
        # capture the (possibly refreshed) session cookie for subsequent requests
        for cookie_name, cookie_value in req.cookies.items():
            if cookie_name == 'JSESSIONID':
                self.jsessionid = cookie_value.rstrip('/')
        timing = time.time() - start_time
    except requests.exceptions.RequestException as exc:
        qquit('CRITICAL', exc)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("response: %s %s", req.status_code, req.reason)
        content = req.content
        try:
            content = jsonpp(req.content).strip()
        except ValueError:
            # leave the raw content for logging if it isn't valid JSON
            pass
        log.debug("content:\n%s\n%s\n%s", '=' * 80, content, '=' * 80)
    if req.status_code != 200:
        info = ''
        try:
            info = ': {0}'.format(json.loads(req.content)['result'])
        except (KeyError, ValueError):
            # best-effort extraction of the server-side error detail
            pass
        qquit('CRITICAL', "%s %s%s" % (req.status_code, req.reason, info))
    return (req, timing)
def search(term, limit=25):
    """Search DockerHub for *term* and return the parsed JSON result dict.

    :param term: search term (url-encoded before sending)
    :param limit: max number of results requested (DockerHub 'n' parameter)

    Dies via die() on HTTP errors, non-200 responses, non-JSON bodies or
    parse failures.
    """
    url = 'https://index.docker.io/v1/search?q={0}&n={1}'.format(urllib.quote_plus(term), limit)
    log.debug('GET %s' % url)
    try:
        verify = True
        # workaround for Travis CI and older pythons - we're not exchanging secret data so this is ok
        #if os.getenv('TRAVIS'):
        #    verify = False
        req = requests.get(url, verify=verify)
    except requests.exceptions.RequestException as _:
        die(_)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        die("%s %s" % (req.status_code, req.reason))
    if not isJson(req.content):
        die('invalid non-JSON response from DockerHub!')
    if log.isEnabledFor(logging.DEBUG):
        print(jsonpp(req.content))
        print('='*80)
    try:
        data = json.loads(req.content)
    # fix: json.loads() raises ValueError on malformed JSON, never KeyError -
    # catching only KeyError meant a parse failure escaped as an unhandled
    # traceback instead of the intended die() message
    except (KeyError, ValueError) as _:
        die('failed to parse output from DockerHub (format may have changed?): {0}'.format(_))
    return data
def query(url):
    """Fetch one page of DockerHub tag results.

    Returns a tuple of (list of tag names on this page, 'next' page URL from
    the response). Dies via die() on HTTP errors, non-JSON bodies or an
    unexpected response structure.
    """
    log.debug('GET %s' % url)
    try:
        verify = True
        # workaround for Travis CI and older pythons - we're not exchanging secret data so this is ok
        #if os.getenv('TRAVIS'):
        #    verify = False
        req = requests.get(url, verify=verify)
    except requests.exceptions.RequestException as _:
        die(_)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        die("%s %s" % (req.status_code, req.reason))
    if not isJson(req.content):
        die('invalid non-JSON response from DockerHub!')
    if log.isEnabledFor(logging.DEBUG):
        print(jsonpp(req.content))
        print('='*80)
    tag_list = []
    try:
        j = json.loads(req.content)
        tag_list = [result['name'] for result in j['results']]
        # pagination is driven by the caller looping over the returned 'next'
        # URL rather than recursing here (avoids any chance of stack overflow)
        return (tag_list, j['next'])
    except KeyError as _:
        die('failed to parse output from DockerHub (format may have changed?): {0}'.format(_))
def run(self):
    """Check the status of one named Jenkins plugin.

    CRITICAL if the plugin is missing, disabled or inactive; WARNING when
    --check-update is set and an update is available. With --list, prints
    all plugin long names sorted case-insensitively and exits UNKNOWN.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            whoami = server.get_whoami()
            log.debug('connected as user %s', jsonpp(whoami))
        log.debug('getting plugin info')
        # .get_plugins() output is not JSON serializable,
        # so must use the old deprecated method get_plugins_info() :-/
        plugins = server.get_plugins_info()
        query_time = time.time() - start_time
    except jenkins.JenkinsException as exc:
        raise CriticalError(exc)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(plugins))
    if self.list_plugins:
        print('Jenkins plugins:\n')
        for plugin_name in sorted((p['longName'] for p in plugins), key=lambda s: s.lower()):
            print(plugin_name)
        sys.exit(ERRORS['UNKNOWN'])
    # case-insensitive match on the plugin long name
    plugin = next((p for p in plugins if p['longName'].lower() == self.plugin.lower()), None)
    if not plugin:
        raise CriticalError("plugin '{0}' not found. Try --list to see installed plugins".format(self.plugin))
    longname = plugin['longName']
    enabled = plugin['enabled']
    active = plugin['active']
    has_update = plugin['hasUpdate']
    self.msg += " plugin '{0}' enabled: {1}, active: {2}".format(longname, enabled, active)
    if not enabled or not active:
        self.critical()
    self.msg += ', update available: {0}'.format(has_update)
    if self.check_update and has_update:
        self.warning()
    self.msg += ' | query_time={0:.4f}s'.format(query_time)
def run(self):
    """Check one Jenkins node/agent: CRITICAL if offline, and check its
    executor count against (lower-boundary) thresholds.

    With --list-nodes, prints node names and exits UNKNOWN. Raises
    CriticalError if the node is not found or the Jenkins API fails, and
    UnknownError if the API returns a non-integer executor count.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            user = server.get_whoami()
            log.debug('connected as user %s', jsonpp(user))
        if self.list_nodes:
            log.debug('getting Jenkins nodes')
            nodes = server.get_nodes()
            log.debug('nodes: %s', nodes)
            print('Jenkins nodes:\n')
            for _ in nodes:
                print(_['name'])
            sys.exit(ERRORS['UNKNOWN'])
        # doesn't find 'master' node despite showing it in the list of nodes,
        # jenkins puts brackets around master
        if self.node == 'master':
            self.node = '(master)'
        node = server.get_node_info(self.node)
    except jenkins.NotFoundException:
        raise CriticalError("node '{0}' not found, did you specify the correct name? See --list to see nodes"\
                            .format(self.node))
    except jenkins.JenkinsException as _:
        raise CriticalError(_)
    query_time = time.time() - start_time
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(node))
    offline = node['offline']
    offline_reason = node['offlineCauseReason']
    num_executors = node['numExecutors']
    # fix: validate BEFORE converting - the original called int() first, which
    # would raise an unhandled ValueError/TypeError on bad data, so the
    # isInt() guard and its UnknownError could never fire
    if not isInt(num_executors):
        raise UnknownError('numExecutors returned non-integer! {0}'.format(support_msg_api()))
    num_executors = int(num_executors)
    if offline:
        self.critical()
        self.msg += 'offline: {0}'.format(offline_reason)
    else:
        self.msg += 'online'
    self.msg += ', num executors = {0}'.format(num_executors)
    self.check_thresholds(num_executors)
    self.msg += ' | num_executors={0:d}'.format(num_executors)
    self.msg += self.get_perf_thresholds(boundary='lower')
    self.msg += ' query_time={0:.4f}s'.format(query_time)
def run(self):
    """Check minutes since the last deployment reported by the policy
    management REST API, compared against (lower-boundary) thresholds.

    qquits CRITICAL on connection / HTTP errors and UNKNOWN on parse errors;
    verbose mode appends the deploying user, host and description.
    """
    log.info("querying %s", self.software)
    url = "{protocol}://{host}:{port}/PolicyManagement/{api_version}/deployments".format(
        host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol
    )
    log.debug("GET %s", url)
    try:
        req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
    except requests.exceptions.RequestException as _:
        errhint = ""
        if "BadStatusLine" in str(_.message):
            errhint = " (possibly connecting to an SSL secured port without using --ssl?)"
        elif self.protocol == "https" and "unknown protocol" in str(_.message):
            errhint = " (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)"
        qquit("CRITICAL", str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", "=" * 80, req.content.strip(), "=" * 80)
    if req.status_code == 400 and req.reason == "Bad Request":
        qquit(
            "CRITICAL",
            "{0}: {1} (possibly new install with no deployments yet?)".format(req.status_code, req.reason),
        )
    if req.status_code != 200:
        qquit("CRITICAL", "{0}: {1}".format(req.status_code, req.reason))
    try:
        json_list = json.loads(req.content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_list))
            print("=" * 80)
        if not isList(json_list):
            raise ValueError("returned content is not a list")
        if not json_list:
            qquit("UNKNOWN", "no deployments found")
        # deployments appear newest-first; take the most recent one
        last_deployment = json_list[0]
        userid = last_deployment["UserId"]
        description = last_deployment["Description"]
        hostname = last_deployment["HostName"]
        timestamp = last_deployment["timestamp"]
        # NOTE(review): '%H' (24-hour) combined with '%p' is unusual - '%I' is
        # the normal pairing with AM/PM; confirm against real API timestamps
        last_deploy_datetime = datetime.strptime(timestamp, "%b %d, %Y %H:%M:%S %p")
    except (KeyError, ValueError) as _:
        # fix: this message was split by a literal newline inside the string
        # literal in the original source, which is a syntax error
        qquit(
            "UNKNOWN",
            "error parsing output from {software}: {exception}: {error}. {support_msg}".format(
                software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()
            ),
        )
    timedelta = datetime.now() - last_deploy_datetime
    mins = int(int(timedelta.total_seconds()) / 60)
    self.msg = "{software} last deployment was at '{timestamp}', {mins} mins ago".format(
        software=self.software, timestamp=timestamp, mins=mins
    )
    self.check_thresholds(mins)
    if self.verbose:
        self.msg += " by user '{userid}', host = '{hostname}', description = '{description}'".format(
            userid=userid, hostname=hostname, description=description
        )
    self.msg += " | mins_since_last_deployment={mins}{thresholds}".format(
        mins=mins, thresholds=self.get_perf_thresholds(boundary="lower")
    )
def run(self):
    """Check that a named Jenkins job exists (CRITICAL if it does not).

    With --list-jobs, prints the full names of all jobs (recursing into
    folders) and exits UNKNOWN. Raises CriticalError on Jenkins API failure.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            whoami = server.get_whoami()
            log.debug('connected as user %s', jsonpp(whoami))
        if self.list_jobs:
            log.debug('getting jobs')
            # get_all_jobs() recurses into folders, unlike get_jobs();
            # with very many folders a server-side groovy script via
            # server.run_script() collecting name/url/color/fullname per
            # AbstractProject (excluding MatrixConfiguration) would be
            # more efficient
            jobs = server.get_all_jobs()
            print('Jenkins Jobs:\n')
            for job in jobs:
                print(job['fullname'])
            sys.exit(ERRORS['UNKNOWN'])
        log.debug('checking job exists')
        if server.job_exists(self.job):
            self.msg += 'exists'
        else:
            self.critical()
            self.msg += 'does not exist!'
    except jenkins.JenkinsException as exc:
        raise CriticalError(exc)
    query_time = time.time() - start_time
    self.msg += ' | query_time={0:.4f}s'.format(query_time)
def process_json(self, content):
    """Parse *content* as JSON, stash it on self.json_data and delegate to
    self.parse_json().

    Raises UnknownError (with the support message) on malformed JSON or
    missing keys during parsing.
    """
    try:
        self.json_data = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('JSON prettified:\n\n%s\n%s', jsonpp(self.json_data), '='*80)
        return self.parse_json(self.json_data)
    # fix: dropped the unused 'as _' binding - the error text comes from
    # self.exception_msg(), and this also matches the sibling implementation
    # of this method elsewhere in the codebase
    except (KeyError, ValueError):
        raise UnknownError('{0}. {1}'.format(self.exception_msg(), support_msg_api()))
def check(self, client):
    """Check the number of Docker images against thresholds, with perfdata."""
    log.info('running Docker info')
    info = client.info()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(info))
    image_count = info['Images']
    self.msg = 'Docker images = {}'.format(image_count)
    self.check_thresholds(image_count)
    self.msg += ' | docker_images={}{}'.format(image_count, self.get_perf_thresholds())
def parse(content):
    """Parse HBase RegionServer JMX JSON and return compactionQueueLength.

    Scans the 'beans' list for the RegionServer Server mbean; qquits UNKNOWN
    on JSON parse failure, a non-integer queue length, or if the mbean is
    absent (likely pointed at the wrong daemon).
    """
    try:
        jmx = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            log.debug(jsonpp(jmx))
        compaction_queue_size = None
        for bean in jmx['beans']:
            if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server':
                if log.isEnabledFor(logging.DEBUG):
                    log.debug('found RegionServer section:')
                    log.debug(jsonpp(bean))
                compaction_queue_size = bean['compactionQueueLength']
                if not isInt(compaction_queue_size):
                    qquit('UNKNOWN', 'non-integer returned for compactionQueueLength! ' + support_msg_api())
                return compaction_queue_size
    except ValueError as _:
        # fix: the exception object must be str()'d before concatenation -
        # 'exception + str' raises TypeError, masking the real parse error
        qquit('UNKNOWN', str(_) + ': failed to parse HBase Master jmx info. ' + support_msg_api())
    qquit('UNKNOWN', 'RegionServer mbean not found, double check this is pointing to an HBase RegionServer')
def check(self, client):
    """CRITICAL unless the Docker daemon reports Swarm attributes
    (ie. Swarm mode is enabled)."""
    log.info('running Docker info')
    swarm = client.swarm
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(swarm.attrs))
    if not swarm.attrs:
        self.msg = 'Docker Swarm not enabled'
        self.critical()
    else:
        self.msg = 'Docker Swarm enabled'
def list(self, url_suffix):
    """GET <url_base>/<url_suffix> and return the parsed JSON
    (Ambari Blueprints listing); qquits CRITICAL on request failure."""
    self.url = self.url_base + '/' + url_suffix
    try:
        response = self.get(url_suffix)
    except requests.exceptions.RequestException as exc:
        # log.critical(err)
        qquit('CRITICAL', 'failed to fetch list of Ambari Blueprints: %s' % exc)
    json_data = json.loads(response)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("json_data = " + jsonpp(json_data))
    return json_data
def run(self):
    """Check the number of policy engine end points reported by the REST API.

    Cross-validates the endpoint count taken from the 'data' list against
    the count embedded in the returned 'message', then applies thresholds.
    qquits CRITICAL on bad status fields and UNKNOWN on parse errors.
    """
    content = self.get()
    try:
        json_dict = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_dict))
            print('=' * 80)
        if not isDict(json_dict):
            raise ValueError('returned content is not a dict')
        status = json_dict['status']
        if status != 'success':
            qquit('CRITICAL', "request status = '{0}' (expected 'success')".format(status))
        status_code = json_dict['statusCode']
        if status_code != 200:
            qquit('CRITICAL', "request status code = '{0}' (expected '200')".format(status_code))
        message = json_dict['message']
        data = json_dict['data']
        if not data:
            num_endpoints = 0
        elif not isList(data):
            qquit('CRITICAL', 'non-list returned for policy end points data')
        else:
            num_endpoints = len(data)
        # fix: re.match() takes (pattern, string) - the arguments were swapped
        # so the *message* was used as the regex; also the square brackets must
        # be escaped, otherwise they form a character class and the pattern has
        # no capture group for group(1)
        match = re.match(r'Total \[(\d+)\] policy engine end point\(s\) found', message, re.I)
        if not match:
            raise ValueError('failed to parse message for confirmation of number of endpoints')
        message_num_endpoints = int(match.group(1))
        if num_endpoints != message_num_endpoints:
            raise ValueError('num endpoints does not match parsed value from returned message')
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    self.msg = "{software} number of policy end points = {num_endpoints}"\
               .format(software=self.software, num_endpoints=num_endpoints)
    self.check_thresholds(num_endpoints)
    self.msg += ' | num_endpoints={num_endpoints}'.format(num_endpoints=num_endpoints) \
                + self.get_perf_thresholds()
def check(self, client):
    """Check a single Docker container's state.

    WARNING when paused/restarting (or when the detail flags disagree with
    the summary status), CRITICAL when not running, dead, errored,
    OOM-killed or exited non-zero. Appends start/finish timestamps, with
    human-readable ages in verbose mode.
    """
    try:
        container = client.containers.get(self.container)
    except docker.errors.APIError as exc:
        raise CriticalError(exc)
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(container.attrs))
    state = container.attrs['State']
    status = state['Status']
    self.msg = "Docker container '{}' status = '{}'".format(self.container, status)
    if status in ('paused', 'restarting'):
        self.warning()
    elif status != 'running':
        self.critical()
    # flag inconsistencies between the summary status and the detail fields
    if state['Paused'] and status != 'paused':
        self.msg += ", paused = '{}'".format(state['Paused'])
        self.warning()
    if state['Restarting'] and status != 'restarting':
        self.msg += ", restarting = '{}'".format(state['Restarting'])
        self.warning()
    if state['Dead']:
        self.msg += ", dead = '{}'!".format(state['Dead'])
        self.critical()
    if state['ExitCode']:
        self.msg += ", exit code = '{}'".format(state['ExitCode'])
        self.critical()
    if state['Error']:
        self.msg += ", error = '{}'".format(state['Error'])
        self.critical()
    if state['OOMKilled']:
        self.msg += ", OOMKilled = '{}'".format(state['OOMKilled'])
        self.critical()
    started = state['StartedAt']
    self.msg += ", started at '{}'".format(started)
    if self.verbose:
        self.msg += ' ({} ago)'.format(self.calculate_human_age(started))
    finished = state['FinishedAt']
    # the zero timestamp means the container has never finished
    if finished != '0001-01-01T00:00:00Z':
        self.msg += ", finished at '{}'".format(finished)
        if self.verbose:
            self.msg += ' ({} ago)'.format(self.calculate_human_age(finished))
def parse_results(self, content):
    """Parse the Attivio AIE system health JSON.

    CRITICAL on any nodes down or fatals, WARNING on warnings or when
    performance monitoring is down; emits all four counts as perfdata.
    qquits UNKNOWN on malformed or unexpected JSON.
    """
    try:
        json_dict = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(content))
            print('=' * 80)
        # looks like syshealthok child div is only there in browser, but give syshealthspin in code
        #if soup.find('div', id='syshealthstatus').find('div', id='syshealthok'):
        if not isDict(json_dict):
            raise ValueError("non-dict returned by Attivio AIE server response (type was '{0}')"\
                             .format(type(json_dict)))
        # 'haserrors' is deliberately not checked here - if true from warnings
        # it would ruin the more appropriate warnings check below
        nodes_down = json_dict['nodesdown']
        warnings = json_dict['warnings']
        fatals = json_dict['fatals']
        acknowledged = json_dict['acknowledged']
        if not isInt(nodes_down):
            raise ValueError('non-integer returned for nodes down count by Attivio AIE')
        if not isInt(warnings):
            raise ValueError('non-integer returned for warnings count by Attivio AIE')
        if not isInt(fatals):
            raise ValueError('non-integer returned for fatals count by Attivio AIE')
        if not isInt(acknowledged):
            raise ValueError('non-integer returned for acknowledged count by Attivio AIE')
        nodes_down = int(nodes_down)
        warnings = int(warnings)
        fatals = int(fatals)
        acknowledged = int(acknowledged)
        if nodes_down > 0 or fatals > 0:
            self.critical()
        elif warnings > 0:
            self.warning()
        self.msg += '{nodes_down} nodes down, {fatals} fatals, {warnings} warnings, {acknowledged} acknowledged'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
        if json_dict['perfmondown']:
            self.warning()
            self.msg += ', warning: performance monitoring down'
        self.msg += ' | nodes_down={nodes_down} fatals={fatals} warnings={warnings} acknowledged={acknowledged}'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
    except (KeyError, ValueError) as _:
        # fix: this message was split by a literal newline inside the string
        # literal in the original source, which is a syntax error
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
def check(self, client):
    """Check Docker Swarm membership: CRITICAL if this daemon is not in a
    Swarm, or if its local node state is anything other than 'active'."""
    log.info('running Docker info')
    info = client.info()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(info))
    swarm = info['Swarm']
    if 'Cluster' not in swarm:
        raise CriticalError('Docker is not a member of a Swarm')
    node_state = swarm['LocalNodeState']
    self.msg = 'Docker Swarm node state = {}'.format(node_state)
    if node_state != 'active':
        self.critical()
def process_json(self, content):
    """Deserialize *content* as JSON, store it on self.json_data and hand it
    to self.parse_json(), returning its result.

    Raises UnknownError (with the support message) on malformed JSON or
    missing keys during parsing.
    """
    try:
        self.json_data = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('JSON prettified:\n\n%s\n%s', jsonpp(self.json_data), '=' * 80)
        return self.parse_json(self.json_data)
    except (KeyError, ValueError):
        # error detail is supplied by self.exception_msg(), hence no binding
        raise UnknownError('{0}. {1}'.format(self.exception_msg(), support_msg_api()))
def check(self, client):
    """Inspect one Docker container and grade its state.

    WARNING for paused/restarting status (or mismatched detail flags),
    CRITICAL for any other non-running status, dead/error/OOMKilled flags
    or a non-zero exit code. Appends start/finish timestamps, with
    human-readable ages in verbose mode.
    """
    try:
        container = client.containers.get(self.container)
    except docker.errors.APIError as exc:
        raise CriticalError(exc)
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(container.attrs))
    state = container.attrs['State']
    status = state['Status']
    self.msg = "Docker container '{}' status = '{}'".format(self.container, status)
    if status in ('paused', 'restarting'):
        self.warning()
    elif status != 'running':
        self.critical()
    paused = state['Paused']
    restarting = state['Restarting']
    # detail flags that contradict the summary status are worth a WARNING
    if paused and status != 'paused':
        self.msg += ", paused = '{}'".format(paused)
        self.warning()
    if restarting and status != 'restarting':
        self.msg += ", restarting = '{}'".format(restarting)
        self.warning()
    if state['Dead']:
        self.msg += ", dead = '{}'!".format(state['Dead'])
        self.critical()
    if state['ExitCode']:
        self.msg += ", exit code = '{}'".format(state['ExitCode'])
        self.critical()
    if state['Error']:
        self.msg += ", error = '{}'".format(state['Error'])
        self.critical()
    if state['OOMKilled']:
        self.msg += ", OOMKilled = '{}'".format(state['OOMKilled'])
        self.critical()
    for label, when in (('started', state['StartedAt']), ('finished', state['FinishedAt'])):
        if label == 'finished' and when == '0001-01-01T00:00:00Z':
            continue  # the zero timestamp means the container never finished
        self.msg += ", {} at '{}'".format(label, when)
        if self.verbose:
            self.msg += ' ({} ago)'.format(self.calculate_human_age(when))
def gen_payload(self, services=None):
    """Build the Ambari RequestSchedule JSON payload that batches one
    service-check request per service.

    :param services: list of service names; None or 'all' expands to every
                     service via self.get_services()
    :returns: the payload serialized as a JSON string
    """
    log.debug('generating payload for services: %s', services)
    if services is None or services == 'all':
        services = self.get_services()
    if not isList(services):
        code_error('non-list passed to gen_payload')
    # schema determined from here:
    # https://community.hortonworks.com/questions/11111/is-there-a-way-to-execute-ambari-service-checks-in.html
    requests_list = []
    payload = [
        {
            "RequestSchedule": {
                "batch": [
                    {"requests": requests_list},
                    {
                        "batch_settings": {
                            "batch_separation_in_seconds": 1,
                            "task_failure_tolerance": 1
                        }
                    }
                ]
            }
        }
    ]
    total = len(services)
    for order_id, service in enumerate(services, 1):
        requests_list.append(
            {
                "order_id": order_id,
                "type": "POST",
                "uri": "/api/v1/clusters/{0}/requests".format(self.cluster),
                "RequestBodyInfo": {
                    "RequestInfo": {
                        "command": "{service}_SERVICE_CHECK".format(service=service.upper()),
                        "context": "{service} Service Check (batch {index} of {total})".
                                   format(service=service, index=order_id, total=total)
                    },
                    "Requests/resource_filters": [
                        {"service_name": service.upper()}
                    ]
                }
            }
        )
    payload_str = json.dumps(payload)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('generated payload:\n%s', jsonpp(payload_str))
    return payload_str
def gen_payload(self, services=None):
    """Generate the Ambari RequestSchedule JSON payload with one batched
    service-check request per service.

    :param services: list of service names; None or 'all' expands to all
                     services via self.get_services()
    :returns: JSON string of the payload
    """
    log.debug('generating payload for services: %s', services)
    if services is None or services == 'all':
        services = self.get_services()
    if not isList(services):
        code_error('non-list passed to gen_payload')
    # schema determined from here:
    # https://community.hortonworks.com/questions/11111/is-there-a-way-to-execute-ambari-service-checks-in.html
    payload = [{
        "RequestSchedule": {
            "batch": [{
                "requests": []
            }, {
                "batch_settings": {
                    "batch_separation_in_seconds": 1,
                    "task_failure_tolerance": 1
                }
            }]
        }
    }]
    service_count = len(services)
    request_entries = payload[0]['RequestSchedule']['batch'][0]['requests']
    for position, service in enumerate(services, start=1):
        service_upper = service.upper()
        request_entries.append({
            "order_id": position,
            "type": "POST",
            "uri": "/api/v1/clusters/{0}/requests".format(self.cluster),
            "RequestBodyInfo": {
                "RequestInfo": {
                    "command": "{service}_SERVICE_CHECK".format(service=service_upper),
                    "context": "{service} Service Check (batch {index} of {total})".
                               format(service=service, index=position, total=service_count)
                },
                "Requests/resource_filters": [{
                    "service_name": service_upper
                }]
            }
        })
    payload_str = json.dumps(payload)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('generated payload:\n%s', jsonpp(payload_str))
    return payload_str
def check_table(self):
    """Check one HBase table via the Thrift connection.

    CRITICAL if the table is missing or disabled; the column family count is
    checked against (lower-boundary) thresholds and emitted as perfdata.
    Closes self.conn when finished.
    """
    log.info('checking table \'%s\'', self.table)
    is_enabled = None
    families = None
    try:
        is_enabled = self.conn.is_table_enabled(self.table)
        log.info('enabled: %s', is_enabled)
        table_handle = self.conn.table(self.table)
        families = table_handle.families()
    except HBaseIOError as _:
        # match on message text since the Thrift layer wraps the
        # server-side org.apache.hadoop.hbase.TableNotFoundException
        if 'TableNotFoundException' in _.message:
            qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
        else:
            qquit('CRITICAL', _)
    except (socket.timeout, ThriftException) as _:
        qquit('CRITICAL', _)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('column families:\n' + jsonpp(families))
    if not families:
        qquit('CRITICAL', 'failed to get column families for table \'{0}\''.format(self.table))
    if not isDict(families):
        qquit('UNKNOWN', 'column family info returned was not a dictionary! ' + support_msg_api())
    num_families = len(families)
    log.info('num families: %s', num_families)
    self.msg = 'HBase table \'{0}\' is '.format(self.table)
    if is_enabled:
        self.msg += 'enabled, '
    else:
        self.critical()
        self.msg += 'disabled! '
    self.msg += '{0} column '.format(num_families)
    self.msg += 'family' if num_families == 1 else 'families'
    self.check_thresholds(num_families)
    self.msg += ' | num_column_families={0}'.format(num_families) + self.get_perf_thresholds(boundary='lower')
    log.info('finished, closing connection')
    self.conn.close()
def get_latest_builds(self):
    """Fetch the latest Travis CI builds for self.repo and return the raw
    JSON response body; raises UnknownError if the body is not JSON."""
    log.info('getting latest builds')
    # the slash in the repo name must be url-encoded or the API returns 404
    url = 'https://api.travis-ci.org/repo/{repo}/builds'.format(repo=self.repo.replace('/', '%2F'))
    # the request returns a blank body without the authorization header
    req = self.request_handler.get(url, headers=self.headers)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("\n%s", jsonpp(req.content))
    if not isJson(req.content):
        raise UnknownError('non-json returned by Travis CI. {0}'.format(support_msg_api()))
    return req.content
def get_version(self):
    """Query the RabbitMQ management API overview endpoint and return the
    broker version string; qquits UNKNOWN on JSON / key errors."""
    url = 'http://{host}:{port}/{path}'.format(host=self.host, port=self.port, path=self.url_path)
    req = RequestHandler().get(url, auth=(self.user, self.password))
    try:
        json_data = json.loads(req.content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_data))
            print('=' * 80)
        return json_data['rabbitmq_version']
    except (KeyError, ValueError) as exc:
        qquit('UNKNOWN', str(exc) + support_msg_api())
def run(self):
    """Check that a Mesos slave is registered with the Mesos Master.

    Queries the master's /master/slaves endpoint, builds a mapping of slave
    hostname -> resolved address (parsed from the 'pid' field), then looks
    for the requested slave by hostname or address, case-insensitively.
    Exits via qquit() with the appropriate Nagios state.
    """
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    slave = self.get_opt('slave')
    list_slaves = self.get_opt('list_slaves')
    validate_host(host)
    validate_port(port)
    if not list_slaves:
        validate_host(slave, 'slave')
    url = 'http://%(host)s:%(port)s/master/slaves' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n{0}\n{1}\n{2}".format('='*80, req.content.strip(), '='*80))
    if req.status_code != 200:
        if req.status_code == 404:
            qquit('CRITICAL', '%s %s (did you point this at the correct Mesos Master?)'
                              % (req.status_code, req.reason))
        qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
    content = req.content
    if not isJson(content):
        qquit('UNKNOWN', 'invalid JSON returned by Mesos Master')
    data = json.loads(content)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('\n%s', jsonpp(data))
    slaves = {}
    # pid looks like 'slave(1)@<ip>:<port>' - extract the address portion
    regex = re.compile(r'^slave\(\d+\)\@(.+):\d+')
    try:
        for item in data['slaves']:
            match = regex.match(item['pid'])
            if match:
                slaves[item['hostname']] = match.group(1)
            else:
                slaves[item['hostname']] = item['pid']
    except KeyError:
        # bug fix: support_msg_api was referenced without being called, which
        # would have interpolated the function repr into the error message
        qquit('UNKNOWN', 'failed to parse slaves from Mesos API output. {0}'.format(support_msg_api()))
    if list_slaves:
        qquit('UNKNOWN', 'Slaves list:\n\n{0}'.format(dict_lines(slaves)))
    log.info('found slaves:\n\n{0}\n'.format(dict_lines(slaves)))
    slave = slave.lower()
    for _ in slaves:
        if slave == _.lower() or slave == slaves[_].lower():
            qquit('OK', "Mesos slave '{0}' registered with master".format(slave))
            break
    else:
        qquit('CRITICAL', "Mesos slave '{0}' not registered with master".format(slave))
def query(self, url):
    """GET a Docker Registry / DockerHub tags URL and parse the response.

    Returns a tuple (tag_list, next_page_url) where next_page_url is None
    when there are no further pages. Dies via die() on HTTP or parse errors.
    """
    log.debug('GET %s' % url)
    try:
        verify = True
        # workaround for Travis CI and older pythons - we're not exchanging secret data so this is ok
        #if os.getenv('TRAVIS'):
        #    verify = False
        if os.getenv('SSL_NOVERIFY') == '1':
            # fix: log.warn() is a deprecated alias of log.warning()
            log.warning('disabling SSL verification')
            verify = False
        auth = None
        if self.user and self.password:
            auth = (self.user, self.password)
            # fix: corrected typo 'authenication' in debug message
            log.debug('setting basic HTTP authentication using username: %s, password: <omitted>',
                      self.user)
        req = requests.get(url, auth=auth, verify=verify)
    except requests.exceptions.RequestException as _:
        die(_)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(), '=' * 80)
    if req.status_code != 200:
        die("%s %s" % (req.status_code, req.reason))
    if not isJson(req.content):
        die('invalid non-JSON response from Docker Registry!')
    if log.isEnabledFor(logging.DEBUG):
        print(jsonpp(req.content))
        print('=' * 80)
    tag_list = []
    try:
        json_data = json.loads(req.content)
        # DockerHub returns like this
        if 'results' in json_data:
            tag_list = [result['name'] for result in json_data['results']]
        # Docker Registry returns like this
        elif 'tags' in json_data:
            tag_list = json_data['tags']
        else:
            raise UnknownError('failed to parse response, found neither results nor tags fields. {0}'\
                               .format(support_msg_api()))
        # could perhaps stack overflow in some scenario
        # not as functional programming 'cool' but will do own tail recursion and just while loop instead
        next_page_url = None
        if 'next' in json_data and json_data['next']:
            # tag_list += self.query(json_data['next'])
            next_page_url = json_data['next']
        return (tag_list, next_page_url)
    except KeyError as _:
        die('failed to parse output from Docker Registry (format may have changed?): {0}'.format(_))
def run(self):
    """Query the Blue Talon Policy Management API for resources.

    Counts total resources across all resource domains, checks the resource
    count against thresholds, and appends status plus perfdata to self.msg.
    Quits CRITICAL on connection/HTTP errors, UNKNOWN on parse errors.
    """
    log.info('querying %s', self.software)
    url = '{protocol}://{host}:{port}/PolicyManagement/{api_version}/resources'\
          .format(host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol)
    log.debug('GET %s', url)
    try:
        req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
    except requests.exceptions.RequestException as _:
        errhint = ''
        # bug fix: requests exceptions have no .message attribute on Python 3 -
        # accessing it raised AttributeError and masked the real error; use str(_)
        if 'BadStatusLine' in str(_):
            errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
        elif self.protocol == 'https' and 'unknown protocol' in str(_):
            errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
        qquit('CRITICAL', str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
    try:
        json_dict = json.loads(req.content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_dict))
            print('='*80)
        if not isDict(json_dict):
            raise ValueError("non-dict returned by Blue Talon API (got type '{0}')".format(type(json_dict)))
        resource_domains_list = json_dict['resource_domains']
        if not isList(resource_domains_list):
            raise ValueError("non-list returned for 'resource_domains' key by Blue Talon API (got type '{0}')"\
                             .format(type(resource_domains_list)))
        num_resource_domains = len(resource_domains_list)
        num_resources = 0
        for resource_domain in resource_domains_list:
            resources = resource_domain['resources']
            if not isList(resources):
                raise ValueError("non-list found for resources in resource_domain '{0}' (got type '{1}'"\
                                 .format(resource_domain['resource_domain_name'], type(resources)))
            num_resources += len(resources)
        self.msg += '{num_resources} resources'.format(num_resources=num_resources)
        self.check_thresholds(num_resources)
        self.msg += ' across {num_resource_domains} resource domains'\
                    .format(num_resource_domains=num_resource_domains)
        self.msg += ' | num_resources={num_resources}{perf} num_resource_domains={num_resource_domains}'\
                    .format(num_resources=num_resources,
                            num_resource_domains=num_resource_domains,
                            perf=self.get_perf_thresholds())
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
def run(self):
    """Connect to Jenkins and count installed plugins and pending updates.

    Goes WARNING when any plugin has an update available and appends a
    summary plus perfdata (with query timing) to self.msg.
    """
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url,
                                 username=self.user,
                                 password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            user = server.get_whoami()
            log.debug('connected as user %s', jsonpp(user))
        log.debug('getting plugin info')
        #plugins = server.get_plugins()
        # deprecated but .get_plugins() output is not JSON serializable
        # so must use old deprecated method get_plugins_info() :-/
        plugins = server.get_plugins_info()
        query_time = time.time() - start_time
    except jenkins.JenkinsException as _:
        raise CriticalError(_)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(plugins))
    plugin_count = len(plugins)
    # count plugins that Jenkins flags as having a newer version available
    update_count = sum(1 for plugin in plugins if plugin['hasUpdate'])
    self.msg += " {0} plugin update{1} available out of {2} installed plugin{3}".format(
        update_count, plural(update_count), plugin_count, plural(plugin_count))
    if update_count:
        self.warning()
    self.msg += ' | updates_available={0};1 plugins_installed={1} query_time={2:.4f}s'.format(
        update_count, plugin_count, query_time)
def run(self):
    """Fetch recent builds for the repo from the Travis CI API and parse them,
    quitting UNKNOWN with the underlying exception line if parsing fails."""
    url = 'https://api.travis-ci.org/repos/{repo}/builds'.format(repo=self.repo)
    handler = RequestHandler()
    response = handler.get(url)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("\n%s", jsonpp(response.content))
    try:
        self.parse_results(response.content)
    except (KeyError, ValueError):
        # keep only the final exception line for a concise alert message
        exception = traceback.format_exc().split('\n')[-2]
        # this covers up the traceback info and makes it harder to debug
        #raise UnknownError('failed to parse expected json response from Travis CI API: {0}'.format(exception))
        qquit('UNKNOWN', 'failed to parse expected json response from Travis CI API: {0}. {1}'.
              format(exception, support_msg_api()))
def run(self):
    """Check AWS root account hygiene via the IAM account summary.

    Goes WARNING when MFA is disabled on the root account or any root
    access keys exist.
    """
    iam = boto3.client('iam')
    log.info('getting account summary')
    summary_response = iam.get_account_summary()
    log.debug('%s', jsonpp(summary_response))
    account_summary = summary_response['SummaryMap']
    mfa_enabled = account_summary['AccountMFAEnabled']
    access_keys = account_summary['AccountAccessKeysPresent']
    if access_keys or not mfa_enabled:
        self.warning()
    self.msg = 'AWS root account MFA enabled = {}{}'.format(
        bool(mfa_enabled), ' (!)' if not mfa_enabled else "")
    self.msg += ', {} access key{} found{}'.format(
        access_keys, plural(access_keys), ' (!)' if access_keys else "")
def check(self, client):
    """Report the Docker Swarm 'Error' field from the Docker info API.

    CRITICAL when a non-empty error string is set; otherwise reports 'None'.
    """
    log.info('running Docker info')
    info = client.info()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(info))
    swarm = info['Swarm']
    if 'Cluster' not in swarm:
        raise CriticalError('Docker is not a member of a Swarm')
    error = swarm['Error']
    if not error:
        error = 'None'
    else:
        self.critical()
    self.msg = 'Docker Swarm error = {}'.format(error)
def run(self):
    """Check AWS CloudTrail trails.

    With --list-trails, prints trail names and exits UNKNOWN. With a trail
    name, checks that trail for logging enabled, multi-region coverage and
    log file validation (each warning individually suppressible). Otherwise
    delegates to check_trails() for all trails.
    """
    client = boto3.client('cloudtrail')
    log.info('describing cloud trails')
    _ = client.describe_trails()
    log.debug('%s', jsonpp(_))
    trail_list = _['trailList']
    num_trails = len(trail_list)
    log.info('found %s trails', num_trails)
    if self.get_opt('list_trails'):
        print('Cloud Trails:\n')
        for trail in trail_list:
            print(trail['Name'])
        sys.exit(ERRORS['UNKNOWN'])
    if self.trail_name:
        trail_info = None
        for trail in trail_list:
            name = trail['Name']
            # skip all trails except the requested one
            if self.trail_name and self.trail_name != name:
                continue
            is_multi_region = trail['IsMultiRegionTrail']
            is_logfile_validation = trail['LogFileValidationEnabled']
            trail_info = client.get_trail_status(Name=name)
            log.debug('%s', jsonpp(trail_info))
        # trail_info stays None if no trail matched the requested name
        if not trail_info:
            raise CriticalError('info for trail \'{}\' not found'.format(self.trail_name))
        is_logging = trail_info['IsLogging']
        if not is_logging:
            self.warning()
        elif not is_multi_region and not self.no_multi_region:
            self.warning()
        elif not is_logfile_validation and not self.no_logfile_validation:
            self.warning()
        self.msg = 'AWS cloudtrail \'{}\' logging: {}, multi-region: {}, logfile-validation-enabled: {}'\
                   .format(self.trail_name, is_logging, is_multi_region, is_logfile_validation)
    else:
        self.check_trails(client, trail_list)
def get_version(self):
    """Query the Blue Talon API and return (build_version, extra_info).

    Sanity checks that the response looks like a Blue Talon server (via the
    company fields) and that the returned API version matches the expected
    self.api_version, quitting UNKNOWN otherwise.
    """
    content = self.get()
    try:
        json_list = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_list))
            print('=' * 80)
        if not isList(json_list):
            raise ValueError(
                "non-list returned by API (is type '{0}')".format(
                    type(json_list)))
        # API wraps the single info object in a list
        json_dict = json_list[0]
        if not isDict(json_dict):
            raise ValueError(
                "non-dict found inside returned list (is type '{0}')".
                format(type(json_dict)))
        company_name = json_dict['company_name']
        company_website = json_dict['company_website']
        # guard against pointing this check at the wrong kind of server
        regex = re.compile(r'Blue\s*Talon', re.I)
        if not regex.match(company_name) and \
           not regex.match(company_website):
            qquit('UNKNOWN', 'Blue Talon name was not found in either company_name or company_website fields' \
                             + ', are you definitely querying a Blue Talon server?')
        build_version = json_dict['build_version']
        update_date = json_dict['update_date']
        api_version = json_dict['api_version']
        if not isVersion(api_version):
            qquit('UNKNOWN', '{0} api version unrecognized \'{1}\'. {2}'\
                             .format(self.software, api_version, support_msg_api()))
        if api_version != self.api_version:
            qquit('UNKNOWN', "unexpected API version '{0}' returned (expected '{1}')"\
                             .format(api_version, self.api_version))
        if self.verbose:
            extra_info = ' revision {revision} build {build}, schema revision = {schema_revision}'\
                         .format(revision=json_dict['revision_no'],
                                 build=json_dict['build_no'],
                                 schema_revision=json_dict['schema_revision'])
            extra_info += ', api version = {api_version}, update date = {update_date}'\
                          .format(api_version=api_version, update_date=update_date)
        else:
            extra_info = ', update date = {update_date}'.format(
                update_date=update_date)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    return (build_version, extra_info)
def parse_latest_failed_build(self, content):
    """Parse Travis CI builds JSON and return the latest failed/errored build.

    Builds are returned by the API most recent first; this iterates all of
    them (rather than stopping at the first failure) so it can also verify
    the build numbers are strictly descending as a sanity check on the API.
    Logs a warning if a more recent passing build exists, and quits UNKNOWN
    when no failed build can be found at all.
    """
    log.debug('parsing latest failed build info')
    build = None
    json_data = json.loads(content)
    if not json_data or \
       'builds' not in json_data or \
       not json_data['builds']:
        qquit('UNKNOWN', "no Travis CI builds returned by the Travis API."
                         + " Either the specified repo '{0}' doesn't exist".format(self.repo)
                         + " or no builds have happened yet?"
                         + " Also remember the repo is case sensitive, for example 'harisekhon/nagios-plugins' returns this"
                         + " blank build set whereas 'HariSekhon/nagios-plugins' succeeds"
                         + " in returning latest builds information")
    builds = json_data['builds']
    # get latest finished failed build
    last_build_number = None
    found_newer_passing_build = False
    for _ in builds:
        # API returns most recent build first so just take the first one that is completed
        # extra check to make sure we're getting the very latest build number and API hasn't changed
        build_number = _['number']
        if not isInt(build_number):
            raise UnknownError('build number returned is not an integer!')
        build_number = int(build_number)
        if last_build_number is None:
            # seed the descending-sequence check one above the first build seen
            last_build_number = int(build_number) + 1
        if build_number >= last_build_number:
            raise UnknownError('build number returned is out of sequence, cannot be >= last build returned' + \
                               '{0}'.format(support_msg_api()))
        last_build_number = build_number
        if _['state'] == 'passed':
            if build is None and not found_newer_passing_build:
                log.warning("found more recent successful build #%s with state = '%s'" + \
                            ", you may not need to debug this build any more", _['number'], _['state'])
                found_newer_passing_build = True
        elif _['state'] in ('failed', 'errored'):
            if build is None:
                build = _
                # by continuing to iterate through the rest of the builds we can check
                # their last_build numbers are descending for extra sanity checking
                #break
    if build is None:
        qquit('UNKNOWN', 'no recent failed builds found' + \
                         ', you may need to specify the --job-id explicitly as shown in the Travis CI UI')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("latest failed build:\n%s", jsonpp(build))
    return build
def fetch(self, url_suffix):
    """Fetch an Ambari Blueprint and return it pretty-printed.

    Strips the 'href' field (invalid when re-submitting to Ambari), optionally
    strips configuration sections to make the blueprint more generic, and
    sorts host groups and their components by name for stable, diffable output.
    """
    err = ''
    try:
        response = self.get(url_suffix)
    except requests.exceptions.RequestException as _:
        err = "failed to fetch Ambari Blueprint from '%s': %s" % (self.url, _)
        # log.critical(err)
        qquit('CRITICAL', err)
    json_data = json.loads(response)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("blueprint = " + jsonpp(json_data))
    try:
        del json_data['href']
        log.debug("stripped href as it's not valid if re-submitting the blueprint to Ambari")
    except KeyError as _:
        pass
    # Ambari 2.1.3 supports this according to:
    # https://cwiki.apache.org/confluence/display/AMBARI/Blueprints#Blueprints-ClusterCreationTemplateStructure
    # json_data['config_recommendation_strategy'] = 'NEVER_APPLY' # default
    # json_data['config_recommendation_strategy'] = 'ONLY_STACK_DEFAULTS_APPLY'
    # json_data['config_recommendation_strategy'] = 'ALWAYS_APPLY'
    if self.strip_config:
        log.info('stripping out config sections of blueprints to make more generic')
        try:
            del json_data['configurations']
            for hostgroup in json_data['host_groups']:
                del hostgroup['configurations']
        except KeyError as _:
            pass
    try:
        json_data['host_groups'] = list_sort_dicts_by_value(json_data['host_groups'], 'name')
        for hostgroup in json_data['host_groups']:
            hostgroup['components'] = list_sort_dicts_by_value(hostgroup['components'], 'name')
    except KeyError as _:
        qquit('CRITICAL', 'failed to sort blueprint: %s' % _)
    return jsonpp(json_data)
def check(self, client):
    """Check the count of Docker Swarm nodes of self.node_type against
    thresholds (lower boundary) and emit perfdata."""
    log.info('running Docker info')
    info = client.info()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(info))
    swarm = info['Swarm']
    if 'Cluster' not in swarm:
        raise CriticalError('Docker is not a member of a Swarm')
    # pick the manager or worker node count depending on what we're checking
    nodes = swarm['Managers'] if self.node_type == 'manager' else swarm['Nodes']
    self.msg = 'Docker Swarm {} nodes = {}'.format(self.node_type, nodes)
    self.check_thresholds(nodes)
    self.msg += ' | docker_swarm_{}_nodes={}{}'.format(self.node_type, nodes,
                                                       self.get_perf_thresholds('lower'))
def get_failing_job_id_from_build(self, build):
    """Return the id of the first finished, failed job within the given build.

    Queries the Travis CI jobs endpoint per job; a job is considered failed
    when its state is 'finished' and its status is None / 1 / '1'. Raises
    UnknownError when the build has no jobs field or no failed job is found.
    """
    log.info('getting failed job id for build %s', build['id'])
    if 'jobs' not in build:
        # bug fix: support_msg_api must be called, not interpolated as a
        # function object into the message
        raise UnknownError('no jobs field found in build, {0}'.format(support_msg_api()))
    for _ in build['jobs']:
        _id = _['id']
        url = 'https://api.travis-ci.org/jobs/{id}'.format(id=_id)
        req = self.request_handler.get(url)
        # if this raises ValueError it'll be caught by run handler
        job = json.loads(req.content)
        if log.isEnabledFor(logging.DEBUG):
            log.debug("job id %s status:\n%s", _id, jsonpp(job))
        if job['state'] == 'finished' and job['status'] in (None, 1, '1'):
            return _id
    raise UnknownError('no failed job found in build {0}'.format(build['id']))
def parse_results(self, content):
    """Parse the Attivio AIE admin health JSON response.

    CRITICAL when any nodes are down or fatals are present, WARNING on
    warnings or when performance monitoring is down; appends a health
    summary and perfdata counters to self.msg.
    """
    try:
        json_dict = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(content))
            print('='*80)
        # looks like syshealthok child div is only there in browser, but give syshealthspin in code
        #if soup.find('div', id='syshealthstatus').find('div', id='syshealthok'):
        if not isDict(json_dict):
            raise ValueError("non-dict returned by Attivio AIE server response (type was '{0}')"\
                             .format(type(json_dict)))
        # if this is true from warnings would ruin the more appropriate warnings check
        #if json_dict['haserrors']:
        #    self.critical()
        #    self.msg += 'errors detected, '
        nodes_down = json_dict['nodesdown']
        warnings = json_dict['warnings']
        fatals = json_dict['fatals']
        acknowledged = json_dict['acknowledged']
        # validate before casting so a bad API response gives a clean UNKNOWN
        if not isInt(nodes_down):
            raise ValueError('non-integer returned for nodes down count by Attivio AIE')
        if not isInt(warnings):
            raise ValueError('non-integer returned for warnings count by Attivio AIE')
        if not isInt(fatals):
            raise ValueError('non-integer returned for fatals count by Attivio AIE')
        if not isInt(acknowledged):
            raise ValueError('non-integer returned for acknowledged count by Attivio AIE')
        nodes_down = int(nodes_down)
        warnings = int(warnings)
        fatals = int(fatals)
        acknowledged = int(acknowledged)
        if nodes_down > 0 or fatals > 0:
            self.critical()
        elif warnings > 0:
            self.warning()
        self.msg += '{nodes_down} nodes down, {fatals} fatals, {warnings} warnings, {acknowledged} acknowledged'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
        if json_dict['perfmondown']:
            self.warning()
            self.msg += ', warning: performance monitoring down'
        self.msg += ' | nodes_down={nodes_down} fatals={fatals} warnings={warnings} acknowledged={acknowledged}'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
def list_checks(self):
    """Print all Pingdom checks (id, name, type, hostname, status) and exit UNKNOWN."""
    self.path = '/api/3.1/checks'
    response = self.query()
    checks_data = json.loads(response.content)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('JSON prettified:\n\n%s\n%s', jsonpp(checks_data), '=' * 80)
    print('Pingdom checks:\n')
    for check in checks_data['checks']:
        print('{id}\t{name}\t{type}\t{hostname}\t{status}'.format(
            id=check['id'],
            name=check['name'],
            type=check['type'],
            hostname=check['hostname'],
            status=check['status']))
    sys.exit(ERRORS['UNKNOWN'])
def check(self, client):
    """Check the count of Docker Swarm nodes of self.node_type against
    thresholds and emit perfdata."""
    log.info('running Docker info')
    info = client.info()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(info))
    swarm = info['Swarm']
    if 'Cluster' not in swarm:
        raise CriticalError('Docker is not a member of a Swarm')
    # pick the manager or worker node count depending on what we're checking
    nodes = swarm['Managers'] if self.node_type == 'manager' else swarm['Nodes']
    self.msg = 'Docker Swarm {} nodes = {}'.format(self.node_type, nodes)
    self.check_thresholds(nodes)
    self.msg += ' | docker_swarm_{}_nodes={}{}'.format(
        self.node_type, nodes, self.get_perf_thresholds())
def parse_json(self, json_data):
    """Parse Yarn Resource Manager apps JSON.

    Finds the most recent finished app whose name matches the --app regex,
    checks its elapsed time against thresholds, and optionally warns when
    multiple apps match the same regex. Exits UNKNOWN after printing the
    app list when --list-apps is given.
    """
    apps = json_data['apps']
    if not apps:
        raise CriticalError('no completed Yarn apps found')
    app_list = apps['app']
    host_info = ''
    if self.verbose:
        host_info = " at '{0}:{1}'".format(self.host, self.port)
    if not isList(app_list):
        raise UnknownError("non-list returned for json_data[apps][app] by Yarn Resource Manager{0}"\
                           .format(host_info))
    num_apps = len(app_list)
    log.info(
        "processing {0:d} running apps returned by Yarn Resource Manager{1}"
        .format(num_apps, host_info))
    if num_apps > self.limit:
        raise UnknownError('num_apps {} > limit {}'.format(
            num_apps, self.limit))
    if self.list_apps:
        self.print_apps(app_list)
        sys.exit(ERRORS['UNKNOWN'])
    matched_app = None
    regex = re.compile(self.app, re.I)
    for app in app_list:
        state = app['state']
        # skip apps that are still running - we only check finished ones
        if state in ('RUNNING', 'ACCEPTED'):
            continue
        if regex.search(app['name']):
            matched_app = app
            break
    if not matched_app:
        raise CriticalError("no finished app/job found with name matching '{app}' in list of last {limit} apps "\
                            .format(app=self.app, limit=self.limit) +
                            "returned by Yarn Resource Manager{host_info}".format(host_info=host_info))
    log.info('found matching app:\n\n%s\n', jsonpp(matched_app))
    elapsed_time = self.check_app(matched_app)
    if self.warn_on_dup_app:
        log.info('checking for duplicate apps matching the same regex')
        count = 0
        for app in app_list:
            # bug fix: use search() like the matching loop above - match()
            # only anchors at the start of the name, so duplicates matching
            # mid-string were silently missed (inconsistent behavior)
            if regex.search(app['name']):
                count += 1
        if count > 1:
            self.msg += ', {0} DUPLICATE APPS WITH MATCHING NAMES DETECTED!'.format(
                count)
    self.msg += ' | app_elapsed_time={0}{1}'.format(
        elapsed_time, self.get_perf_thresholds())
def check(self, client):
    """Check whether this Docker node is a Swarm manager; CRITICAL if not."""
    log.info('running Docker info')
    info = client.info()
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(info))
    swarm = info['Swarm']
    if 'Cluster' not in swarm:
        raise CriticalError('Docker is not a member of a Swarm')
    # this field isn't documented that I can see, originally thought it meant quorum
    # but doc says it mirrors 'docker info' command which has 'Is Manager: true' field
    is_manager = swarm['ControlAvailable']
    self.msg = 'Docker Swarm node is '
    if is_manager:
        self.msg += 'a manager'
    else:
        self.msg += 'not a manager'
        self.critical()
def run(self):
    """Retrieve recent Travis CI builds for the repo and hand them to
    parse_results, converting parse failures into a clean UNKNOWN exit."""
    url = 'https://api.travis-ci.org/repos/{repo}/builds'.format(
        repo=self.repo)
    handler = RequestHandler()
    response = handler.get(url)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("\n%s", jsonpp(response.content))
    try:
        self.parse_results(response.content)
    except (KeyError, ValueError):
        # keep only the final line of the traceback for a concise alert
        exception = traceback.format_exc().split('\n')[-2]
        # this covers up the traceback info and makes it harder to debug
        #raise UnknownError('failed to parse expected json response from Travis CI API: {0}'.format(exception))
        qquit(
            'UNKNOWN',
            'failed to parse expected json response from Travis CI API: {0}. {1}'
            .format(exception, support_msg_api()))
def check_table_regions(self):
    """Check HBase table regions via Thrift.

    Checks the region count against thresholds (lower boundary) and goes
    WARNING if any regions are not assigned to a RegionServer; emits
    perfdata for both counts.
    """
    log.info('checking regions for table \'%s\'', self.table)
    regions = None
    try:
        table = self.conn.table(self.table)
        regions = table.regions()
    except HBaseIOError as _:
        # detect a missing table by substring match on the Thrift error message
        #if 'org.apache.hadoop.hbase.TableNotFoundException' in _.message:
        if 'TableNotFoundException' in _.message:
            qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
        else:
            qquit('CRITICAL', _)
    except (socket.error, socket.timeout, ThriftException) as _:
        qquit('CRITICAL', _)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(regions))
    if not regions:
        qquit('CRITICAL', 'failed to get regions for table \'{0}\''.format(self.table))
    if not isList(regions):
        qquit('UNKNOWN', 'region info returned is not a list! ' + support_msg_api())
    num_regions = len(regions)
    log.info('num regions: %s', num_regions)
    self.msg = 'HBase table \'{0}\' has {1} region{2}'.format(self.table, num_regions, plural(num_regions))
    self.check_thresholds(num_regions)
    num_unassigned_regions = 0
    for region in regions:
        try:
            # an empty server_name means the region has no RegionServer assigned
            if not region['server_name']:
                #log.debug('region \'%s\' is not assigned to any server', region['name'])
                num_unassigned_regions += 1
        except KeyError as _:
            qquit('UNKNOWN', 'failed to find server assigned to region. ' + support_msg_api())
    log.info('num unassigned regions: %s', num_unassigned_regions)
    self.msg += ', {0} unassigned region{1}'.format(num_unassigned_regions, plural(num_unassigned_regions))
    if num_unassigned_regions > 0:
        self.warning()
        self.msg += '!'
    self.msg += ' |'
    self.msg += ' num_regions={0}'.format(num_regions) + self.get_perf_thresholds(boundary='lower')
    # hardcoded warn threshold of 1 unassigned region in the perfdata
    self.msg += ' num_unassigned_regions={0};1;0'.format(num_unassigned_regions)
    log.info('finished, closing connection')
    self.conn.close()
def check_table_regions(self):
    """Check HBase table regions via Thrift.

    Checks the region count against thresholds (lower boundary) and goes
    WARNING if any regions are not assigned to a RegionServer; emits
    perfdata for both counts.
    """
    log.info('checking regions for table \'%s\'', self.table)
    regions = None
    try:
        table = self.conn.table(self.table)
        regions = table.regions()
    except HBaseIOError as _:
        #if 'org.apache.hadoop.hbase.TableNotFoundException' in _.message:
        if 'TableNotFoundException' in _.message:
            qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
        else:
            qquit('CRITICAL', _)
    # fix: also catch socket.error (e.g. connection reset), consistent with the
    # sibling check_table_regions implementation - otherwise such errors escape
    # as unhandled tracebacks instead of a clean CRITICAL
    except (socket.error, socket.timeout, ThriftException) as _:
        qquit('CRITICAL', _)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(regions))
    if not regions:
        qquit('CRITICAL', 'failed to get regions for table \'{0}\''.format(self.table))
    if not isList(regions):
        qquit('UNKNOWN', 'region info returned is not a list! ' + support_msg_api())
    num_regions = len(regions)
    log.info('num regions: %s', num_regions)
    self.msg = 'HBase table \'{0}\' has {1} region{2}'.format(self.table, num_regions, plural(num_regions))
    self.check_thresholds(num_regions)
    num_unassigned_regions = 0
    for region in regions:
        try:
            # an empty server_name means the region has no RegionServer assigned
            if not region['server_name']:
                #log.debug('region \'%s\' is not assigned to any server', region['name'])
                num_unassigned_regions += 1
        except KeyError as _:
            qquit('UNKNOWN', 'failed to find server assigned to region. ' + support_msg_api())
    log.info('num unassigned regions: %s', num_unassigned_regions)
    self.msg += ', {0} unassigned region{1}'.format(num_unassigned_regions, plural(num_unassigned_regions))
    if num_unassigned_regions > 0:
        self.warning()
        self.msg += '!'
    self.msg += ' |'
    self.msg += ' num_regions={0}'.format(num_regions) + self.get_perf_thresholds(boundary='lower')
    self.msg += ' num_unassigned_regions={0};1;0'.format(num_unassigned_regions)
    log.info('finished, closing connection')
    self.conn.close()
def get_latest_failed_build(self):
    """Return the most recent failed/errored build for the repo from the
    Travis CI API, quitting UNKNOWN if the response cannot be parsed."""
    log.info('getting latest failed build')
    # gets 404 unless replacing the slash
    builds_url = 'https://api.travis-ci.org/repo/{repo}/builds'.format(
        repo=self.repo.replace('/', '%2F'))
    # request returns blank without authorization header
    response = self.request_handler.get(builds_url, headers=self.headers)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("\n%s", jsonpp(response.content))
    try:
        return self.parse_latest_failed_build(response.content)
    except (KeyError, ValueError):
        # keep only the final line of the traceback for a concise alert
        exception = traceback.format_exc().split('\n')[-2]
        # this covers up the traceback info and makes it harder to debug
        #raise UnknownError('failed to parse expected json response from Travis CI API: {0}'.format(exception))
        qquit('UNKNOWN', 'failed to parse expected json response from Travis CI API: {0}. {1}'.
              format(exception, support_msg_api()))
def check(self, client):
    """Check a Docker image's id and size via the Docker API.

    Looks up self.docker_image, then delegates to check_id() and
    check_size(). CRITICAL if the image cannot be found / API errors.
    """
    # images = client.images.list()
    # print(images)
    try:
        image = client.images.get(self.docker_image)
    except docker.errors.APIError as _:
        raise CriticalError(_)
    if log.isEnabledFor(logging.DEBUG):
        log.debug(jsonpp(image.attrs))
    _id = image.short_id
    # 'VirtualSize' was deprecated and removed in Docker Engine API v1.44+,
    # where it had long been identical to 'Size' - fall back to 'Size' so
    # this keeps working against newer daemons
    #size = image.attrs['Size']
    size = image.attrs.get('VirtualSize', image.attrs['Size'])
    self.msg = "Docker image '{repo}'".format(repo=self.docker_image)
    self.check_id(_id)
    self.check_size(size)
def parse(json_data):
    """Extract compactionQueueLength from HBase RegionServer JMX beans JSON.

    Returns the integer queue length, or quits UNKNOWN if the RegionServer
    mbean is missing, the value is non-integer, or the JSON shape is wrong.
    """
    try:
        # it's already nicely layed out
        #if log.isEnabledFor(logging.DEBUG):
        #    log.debug('%s', jsonpp(json_data))
        compaction_queue_size = None
        for bean in json_data['beans']:
            if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server':
                if log.isEnabledFor(logging.DEBUG):
                    log.debug('found RegionServer section:')
                    log.debug('%s', jsonpp(bean))
                compaction_queue_size = bean['compactionQueueLength']
                if not isInt(compaction_queue_size):
                    qquit('UNKNOWN', 'non-integer returned for compactionQueueLength! ' + support_msg_api())
                return compaction_queue_size
    except KeyError as _:
        # bug fix: concatenating the exception object directly to a str raised
        # TypeError, crashing the error path - convert with str() first
        qquit('UNKNOWN', str(_) + ': failed to parse HBase Master jmx info. ' + support_msg_api())
    qquit('UNKNOWN', 'RegionServer mbean not found, double check this is pointing to an HBase RegionServer')
def get_version(self):
    """Query the Blue Talon API and return (build_version, extra_info).

    Sanity checks that the response looks like a Blue Talon server (via the
    company fields) and that the returned API version matches the expected
    self.api_version, quitting UNKNOWN otherwise.
    """
    content = self.get()
    try:
        json_list = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_list))
            print('='*80)
        if not isList(json_list):
            raise ValueError("non-list returned by API (is type '{0}')".format(type(json_list)))
        # API wraps the single info object in a list
        json_dict = json_list[0]
        if not isDict(json_dict):
            raise ValueError("non-dict found inside returned list (is type '{0}')".format(type(json_dict)))
        company_name = json_dict['company_name']
        company_website = json_dict['company_website']
        # guard against pointing this check at the wrong kind of server
        regex = re.compile(r'Blue\s*Talon', re.I)
        if not regex.match(company_name) and \
           not regex.match(company_website):
            qquit('UNKNOWN', 'Blue Talon name was not found in either company_name or company_website fields' \
                             + ', are you definitely querying a Blue Talon server?')
        build_version = json_dict['build_version']
        update_date = json_dict['update_date']
        api_version = json_dict['api_version']
        if not isVersion(api_version):
            qquit('UNKNOWN', '{0} api version unrecognized \'{1}\'. {2}'\
                             .format(self.software, api_version, support_msg_api()))
        if api_version != self.api_version:
            qquit('UNKNOWN', "unexpected API version '{0}' returned (expected '{1}')"\
                             .format(api_version, self.api_version))
        if self.verbose:
            extra_info = ' revision {revision} build {build}, schema revision = {schema_revision}'\
                         .format(revision=json_dict['revision_no'],
                                 build=json_dict['build_no'],
                                 schema_revision=json_dict['schema_revision'])
            extra_info += ', api version = {api_version}, update date = {update_date}'\
                          .format(api_version=api_version, update_date=update_date)
        else:
            extra_info = ', update date = {update_date}'.format(update_date=update_date)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    return (build_version, extra_info)