def process_build_info(self, build_info):
    # Summarise one Jenkins build dict into self.msg and set the plugin state.
    # Expects keys: displayName, duration (millisecs), result, timestamp (epoch millisecs), building.
    displayname = build_info['displayName']
    duration = build_info['duration']
    if not isInt(duration):
        raise UnknownError('duration field returned non-integer! {0}'.format(support_msg_api()))
    # API gives millisecs - convert to seconds
    duration = int(duration) / 1000
    result = build_info['result']
    timestamp = build_info['timestamp']
    if not isInt(timestamp):
        raise UnknownError('timestamp field returned non-integer! {0}'.format(support_msg_api()))
    timestamp = int(timestamp)
    building = build_info['building']
    self.msg += "build {build} status: ".format(build=displayname)
    if building:
        # an in-progress build has no meaningful result/duration yet
        self.unknown()
        self.msg += 'STILL BUILDING!'
        return
    self.msg += result
    if result != 'SUCCESS':
        self.critical()
    self.msg += ', duration={duration} secs'.format(duration=duration)
    self.check_thresholds(duration)
    # timestamp is epoch millisecs
    age = time.time() - (timestamp/1000)
    self.msg += ', age={age} secs'.format(age=sec2human(age))
    if age < 0:
        # negative age implies clock skew between us and the Jenkins server
        self.warning()
        self.msg += ' (< 0!)'
    if self.age and age > self.age:
        self.critical()
        self.msg += ' (> {0:d})'.format(self.age)
    self.msg += ' | build_duration={duration}s{perf_thresholds}'.format(duration=duration, \
                perf_thresholds=self.get_perf_thresholds())
def check_app(self, app):
    """Check one running Yarn application dict against configured expectations.

    Validates user, queue, running container count and elapsed time, appending
    details to self.msg and escalating plugin state as needed.
    Returns the app's elapsed time in seconds.
    """
    state = app['state']
    user = app['user']
    queue = app['queue']
    running_containers = app['runningContainers']
    elapsed_time = app['elapsedTime']
    # explicit raises instead of asserts - asserts are stripped under python -O
    # (consistent with check_app_elapsed_times which raises UnknownError)
    if not isInt(running_containers, allow_negative=True):
        raise UnknownError('runningContainers {} is not an integer!'.format(running_containers))
    if not isInt(elapsed_time):
        raise UnknownError('elapsedTime {} is not an integer!'.format(elapsed_time))
    running_containers = int(running_containers)
    # API returns millisecs
    elapsed_time = int(elapsed_time / 1000)
    self.msg = "Yarn application '{0}' state = '{1}'".format(app['name'], state)
    if state != 'RUNNING':
        self.critical()
    ##################
    # This shouldn't be used any more now using more targeted query to only return running apps
    # state = FAILED / KILLED also gets final status = FAILED KILLED, no point double printing
    if state == 'FINISHED':
        self.msg += ", final status = '{0}'".format(app['finalStatus'])
    ##################
    # bugfix: format string had no placeholder so the user was never interpolated
    self.msg += ", user = '{0}'".format(user)
    if self.app_user is not None and self.app_user != user:
        self.critical()
        self.msg += " (expected '{0}')".format(self.app_user)
    self.msg += ", queue = '{0}'".format(queue)
    if self.queue is not None and self.queue != queue:
        self.critical()
        self.msg += " (expected '{0}')".format(self.queue)
    self.msg += ", running containers = {0}".format(running_containers)
    if self.min_containers is not None and running_containers < self.min_containers:
        self.critical()
        self.msg += " (< '{0}')".format(self.min_containers)
    self.msg += ", elapsed time = {0} secs".format(elapsed_time)
    self.check_thresholds(elapsed_time)
    return elapsed_time
def parse_results(self, content):
    # Parse the Attivio AIE admin health summary JSON, setting plugin state from
    # node/fatal/warning counts and appending a human summary plus perfdata to self.msg.
    try:
        json_dict = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(content))
            print('=' * 80)
        # looks like syshealthok child div is only there in browser, but give syshealthspin in code
        #if soup.find('div', id='syshealthstatus').find('div', id='syshealthok'):
        if not isDict(json_dict):
            raise ValueError("non-dict returned by Attivio AIE server response (type was '{0}')"\
                             .format(type(json_dict)))
        # if this is true from warnings would ruin the more appropriate warnings check
        #if json_dict['haserrors']:
        #    self.critical()
        #    self.msg += 'errors detected, '
        nodes_down = json_dict['nodesdown']
        warnings = json_dict['warnings']
        fatals = json_dict['fatals']
        acknowledged = json_dict['acknowledged']
        if not isInt(nodes_down):
            raise ValueError(
                'non-integer returned for nodes down count by Attivio AIE')
        if not isInt(warnings):
            raise ValueError(
                'non-integer returned for warnings count by Attivio AIE')
        if not isInt(fatals):
            raise ValueError(
                'non-integer returned for fatals count by Attivio AIE')
        if not isInt(acknowledged):
            raise ValueError(
                'non-integer returned for acknowledged count by Attivio AIE'
            )
        nodes_down = int(nodes_down)
        warnings = int(warnings)
        fatals = int(fatals)
        acknowledged = int(acknowledged)
        # any node down or fatal => CRITICAL, warnings alone => WARNING
        if nodes_down > 0 or fatals > 0:
            self.critical()
        elif warnings > 0:
            self.warning()
        self.msg += '{nodes_down} nodes down, {fatals} fatals, {warnings} warnings, {acknowledged} acknowledged'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
        if json_dict['perfmondown']:
            self.warning()
            self.msg += ', warning: performance monitoring down'
        # perfdata section
        self.msg += ' | nodes_down={nodes_down} fatals={fatals} warnings={warnings} acknowledged={acknowledged}'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
def parse_json(self, json_data):
    # Jenkins job/build JSON handler: lists jobs, or follows the job's
    # lastCompletedBuild reference, or reports on a single build's status.
    if self.list_jobs:
        print('Jenkins Jobs:\n')
        for job in json_data['jobs']:
            print(job['name'])
        sys.exit(ERRORS['UNKNOWN'])
    if 'lastCompletedBuild' in json_data:
        last_completed_build = json_data['lastCompletedBuild']
        if not last_completed_build:
            raise WarningError(
                "job '{job}' not built yet".format(job=self.job))
        # re-query the API for that specific build and re-enter via process_json
        self.path = '/job/{job}/{number}/api/json'.format(
            job=self.job, number=last_completed_build['number'])
        req = self.query()
        self.process_json(req.content)
        return
    displayname = json_data['displayName']
    duration = json_data['duration']
    if not isInt(duration):
        raise UnknownError(
            'duration field returned non-integer! {0}'.format(
                support_msg_api()))
    # API gives millisecs - convert to seconds
    duration = int(duration) / 1000
    result = json_data['result']
    timestamp = json_data['timestamp']
    if not isInt(timestamp):
        raise UnknownError(
            'timestamp field returned non-integer! {0}'.format(
                support_msg_api()))
    timestamp = int(timestamp)
    building = json_data['building']
    self.msg += "build {build} status: ".format(build=displayname)
    if building:
        # an in-progress build has no meaningful result/duration yet
        self.unknown()
        self.msg += 'STILL BUILDING!'
        return
    self.msg += result
    if result != 'SUCCESS':
        self.critical()
    self.msg += ', duration={duration} secs'.format(duration=duration)
    self.check_thresholds(duration)
    # timestamp is epoch millisecs
    age = time.time() - (timestamp / 1000)
    self.msg += ', age={age} secs'.format(age=sec2human(age))
    if age < 0:
        # negative age implies clock skew between us and the Jenkins server
        self.warning()
        self.msg += ' (< 0!)'
    if self.age and age > self.age:
        self.critical()
        self.msg += ' (> {0:d})'.format(self.age)
    self.msg += ' | build_duration={duration}s{perf_thresholds}'.format(
        duration=duration, perf_thresholds=self.get_perf_thresholds())
def parse_results(self, content):
    # Parse the Attivio AIE admin health summary JSON, setting plugin state from
    # node/fatal/warning counts and appending a human summary plus perfdata to self.msg.
    try:
        json_dict = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(content))
            print('='*80)
        # looks like syshealthok child div is only there in browser, but give syshealthspin in code
        #if soup.find('div', id='syshealthstatus').find('div', id='syshealthok'):
        if not isDict(json_dict):
            raise ValueError("non-dict returned by Attivio AIE server response (type was '{0}')"\
                             .format(type(json_dict)))
        # if this is true from warnings would ruin the more appropriate warnings check
        #if json_dict['haserrors']:
        #    self.critical()
        #    self.msg += 'errors detected, '
        nodes_down = json_dict['nodesdown']
        warnings = json_dict['warnings']
        fatals = json_dict['fatals']
        acknowledged = json_dict['acknowledged']
        if not isInt(nodes_down):
            raise ValueError('non-integer returned for nodes down count by Attivio AIE')
        if not isInt(warnings):
            raise ValueError('non-integer returned for warnings count by Attivio AIE')
        if not isInt(fatals):
            raise ValueError('non-integer returned for fatals count by Attivio AIE')
        if not isInt(acknowledged):
            raise ValueError('non-integer returned for acknowledged count by Attivio AIE')
        nodes_down = int(nodes_down)
        warnings = int(warnings)
        fatals = int(fatals)
        acknowledged = int(acknowledged)
        # any node down or fatal => CRITICAL, warnings alone => WARNING
        if nodes_down > 0 or fatals > 0:
            self.critical()
        elif warnings > 0:
            self.warning()
        self.msg += '{nodes_down} nodes down, {fatals} fatals, {warnings} warnings, {acknowledged} acknowledged'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
        if json_dict['perfmondown']:
            self.warning()
            self.msg += ', warning: performance monitoring down'
        # perfdata section
        self.msg += ' | nodes_down={nodes_down} fatals={fatals} warnings={warnings} acknowledged={acknowledged}'\
                    .format(nodes_down=nodes_down, fatals=fatals, warnings=warnings, acknowledged=acknowledged)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
def mac_getent_passwd_user(self, user):
    """Emulate 'getent passwd <user>' on Mac OS X by parsing 'dscl . -read'.

    Returns (getent_record, returncode) where getent_record is a colon separated
    passwd-style line 'user:password:uid:gid:name:homedir:shell', or '' when the
    user record was not found.
    """
    log.info('mac_getent_passwd_user(%s)', user)
    command = 'dscl . -read /Users/{user}'.format(user=user)
    (output, returncode) = self.cmd(command)
    user = password = uid = gid = name = homedir = shell = ''
    #log.info('parsing output for passwd conversion')
    output = output.split('\n')
    for (index, line) in enumerate(output):
        tokens = line.split()
        if len(tokens) < 1:
            continue
        field = tokens[0]
        if len(tokens) < 2:
            value = ''
        else:
            value = tokens[1]
        if field == 'RecordName:':
            user = value
        elif field == 'Password:':
            # never expose the real password hash - use the standard 'x' placeholder
            # (reconstructed: this branch was mangled in the source)
            password = 'x'
        elif field == 'UniqueID:':
            uid = value
        elif field == 'PrimaryGroupID:':
            gid = value
        elif field == 'RealName:':
            name = value
            # dscl wraps the real name on to the following indented line when
            # the 'RealName:' line itself carries no value
            if not value and len(output) > index + 1 and output[index + 1].startswith(' '):
                name = output[index + 1].strip()
        # NOTE(review): unreachable - the first branch already consumes 'RecordName:';
        # kept for parity with the original, confirm whether a different field was intended
        elif not name and field == 'RecordName:':
            name = value
        elif field == 'NFSHomeDirectory:':
            homedir = value
        elif field == 'UserShell:':
            shell = value
    if not user:
        return ('', returncode)
    # bugfix: the homedir field was mangled placeholder junk - restore {homedir},
    # which the format kwargs already supplied
    getent_record = '{user}:{password}:{uid}:{gid}:{name}:{homedir}:{shell}'.format\
                    (user=user, password=password, uid=uid, gid=gid, name=name, homedir=homedir, shell=shell)
    if not isInt(uid, allow_negative=True):
        die("parsing error: UID '{uid}' is not numeric in record {record}!"
            .format(uid=uid, record=getent_record))
    if not isInt(gid, allow_negative=True):
        die("parsing error: GID '{gid}' is not numeric in record {record}!"
            .format(gid=gid, record=getent_record))
    return (getent_record, returncode)
def check_file(self, filename):
    """Check one filename for gaps in its numeric sequence.

    Extracts (prefix, number, suffix) via self.regex, then walks backwards from
    the captured number recording any missing predecessors, printing them when found.
    Files not matching the numeric regex are skipped.
    """
    log.debug('checking file \'%s\'', filename)
    match = self.regex.search(os.path.basename(filename))
    if not match:
        log.debug('no numeric regex match for file, probably not a sequential file' + \
                  ', skipping \'%s\'', filename)
        return
    # will error out here if you've supplied your own regex without capture brackets
    # or if you've got pre-captures - let this bubble to user to fix their regex
    file_prefix = os.path.join(os.path.dirname(filename), match.group(1))
    file_number = match.group(2)
    file_suffix = match.group(3)
    if not isInt(file_number):
        # bugfix: message previously said 'non-float' but the check requires an integer
        raise UnknownError('regex captured non-integer for filename: {}'.format(filename))
    if file_prefix is None:
        file_prefix = ''
    if file_suffix is None:
        file_suffix = ''
    # remember zero-padding width so reconstructed earlier filenames match on disk
    padding = len(file_number)
    file_number = int(file_number)
    while file_number > 1:
        file_number = self.determine_missing_file_backfill(file_prefix, file_number, padding, file_suffix)
    if self.missing_files:
        print('\n'.join(reversed(self.missing_files)))
        self.missing_files = []
def parse_json(self, json_data):
    """Parse NameNode JMX LiveNodes and report the worst HDFS used-space imbalance %.

    Imbalance is (max used - min used) / max used * 100 across all live datanodes.
    Raises UnknownError on missing/malformed data.
    """
    log.info('parsing response')
    try:
        live_nodes = json_data['beans'][0]['LiveNodes']
        # LiveNodes is itself a JSON string embedded in the JMX response
        live_node_data = json.loads(live_nodes)
        num_datanodes = len(live_node_data)
        if num_datanodes < 1:
            raise UnknownError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\
                               .format(self.host, self.port))
        min_space = None
        max_space = 0
        for datanode in live_node_data:
            used_space = live_node_data[datanode]['usedSpace']
            if not isInt(used_space):
                raise UnknownError(
                    'usedSpace is not an integer! {0}'.format(
                        support_msg_api()))
            used_space = int(used_space)
            log.info("datanode '%s' used space = %s", datanode, used_space)
            if min_space is None or used_space < min_space:
                min_space = used_space
            if used_space > max_space:
                max_space = used_space
        divisor = max_space
        if divisor < 1:
            log.info(
                'min used space < 1, resetting divisor to 1 (% will likely be very high)'
            )
            divisor = 1
        assert max_space >= min_space
        largest_imbalance_pc = float('{0:.2f}'.format(
            ((max_space - min_space) / divisor) * 100))
        assert largest_imbalance_pc >= 0
        self.ok()
        self.msg = '{0}% HDFS imbalance on space used'.format(
            largest_imbalance_pc)
        self.check_thresholds(largest_imbalance_pc)
        self.msg += ' across {0:d} datanode{1}'.format(
            num_datanodes, plural(num_datanodes))
        if self.verbose:
            self.msg += ', min used space = {0}, max used space = {1}'.format(
                min_space, max_space)
        if self.verbose and (self.is_warning() or self.is_critical()):
            self.msg += ' [imbalanced nodes: '
            for datanode in live_node_data:
                used_space = live_node_data[datanode]['usedSpace']
                used_pc = used_space / max_space * 100
                if used_pc > self.thresholds['warning']['upper']:
                    # bugfix: '{1:.2f%}' is an invalid format spec and raised ValueError
                    # whenever this branch ran; print the node's used-space percentage
                    self.msg += '{0}({1:.2f}%),'.format(datanode, used_pc)
            self.msg = self.msg.rstrip(',') + ']'
        self.msg += " | 'HDFS imbalance on space used %'={0}".format(
            largest_imbalance_pc)
        self.msg += self.get_perf_thresholds()
        self.msg += " num_datanodes={0}".format(num_datanodes)
        self.msg += " min_used_space={0}".format(min_space)
        self.msg += " max_used_space={0}".format(max_space)
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
def __parse_args__(self):
    """Parse command line options, handle help/version, and fold in $VERBOSE.

    Returns (options, args) after delegating remaining parsing to self.parse_args().
    """
    try:
        (self.options, self.args) = self.__parser.parse_args()
    # I don't agree with zero exit code from OptionParser for help/usage,
    # and want UNKNOWN not CRITICAL(2) for switch mis-usage...
    except SystemExit:  # pragma: no cover
        sys.exit(ERRORS['UNKNOWN'])
    if self.options.help:  # pragma: no cover
        self.usage()
    if self.options.version:  # pragma: no cover
        print('%(version)s' % self.__dict__)
        sys.exit(ERRORS['UNKNOWN'])
    if 'timeout' in dir(self.options):
        self.timeout = self.get_opt('timeout')
    env_verbose = os.getenv('VERBOSE')
    if isInt(env_verbose):
        # bugfix: env vars are strings - compare and assign as int, otherwise this
        # is a TypeError on Python 3 and stores a str (consistent with __parse_verbose__)
        if int(env_verbose) > self.verbose:
            log.debug('environment variable $VERBOSE = %s, increasing verbosity', env_verbose)
            self.verbose = int(env_verbose)
    elif env_verbose is None:
        pass
    else:
        log.warning("$VERBOSE environment variable is not an integer ('%s')", env_verbose)
    self.parse_args()
    return self.options, self.args
def timeout_max(self, secs):
    """Set the maximum permissible timeout in seconds (None removes the cap)."""
    if not (secs is None or isInt(secs)):
        raise CodingError('invalid timeout max passed to set_timeout_max(), must be an integer representing seconds') # pylint: disable=line-too-long
    # deliberately no range validation - the max may be raised to any amount
    # validate_int(secs, 'timeout default', 0, self.__timeout_max )
    log.debug('setting max timeout to %s secs', secs)
    self.__timeout_max = secs
def run(self):
    """Query Attivio AIE's ingest session count REST endpoint and check thresholds.

    Sets self.msg with the count plus perfdata; quits CRITICAL on connection or
    HTTP errors and UNKNOWN on unparseable output.
    """
    url = '{protocol}://{host}:{port}/rest/ingestApi/getSessionCount'.format(
        host=self.host, port=self.port, protocol=self.protocol)
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        errhint = ''
        if 'BadStatusLine' in str(_.message):
            errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
        elif self.protocol == 'https' and 'unknown protocol' in str(
                _.message):
            errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
        qquit('CRITICAL', str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(), '=' * 80)
    if req.status_code != 200:
        qquit('CRITICAL', '{0} {1}'.format(req.status_code, req.reason))
    try:
        count = req.content.strip()
        if not isInt(count):
            raise ValueError('non-integer value returned by Attivio AIE')
        count = int(count)
        self.msg = '{software} ingest session count = {count}'.format(
            software=self.software, count=count)
        self.check_thresholds(count)
    # bugfix: must bind the exception with 'as _' - the handler below references
    # '_' which was otherwise unbound here, raising NameError instead of reporting
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    self.msg += ' | ingest_session_count={0:d}{thresholds}'.format(
        count, thresholds=self.get_perf_thresholds())
def parse(self, content):
    # Scrape the regions-in-transition count from the HMaster UI status page HTML.
    # could also collect lines after 'Regions-in-transition' if parsing /dump
    # sample:
    # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
    # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
    soup = BeautifulSoup(content, 'html.parser')
    #if log.isEnabledFor(logging.DEBUG):
    #    log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
    # looks like HMaster UI doesn't print this section if there are no regions in transition, must assume zero
    regions_in_transition = 0
    try:
        headings = soup.findAll('h2')
        for heading in headings:
            log.debug("checking heading '%s'", heading)
            if heading.get_text() == "Regions in Transition":
                log.debug('found Regions in Transition section header')
                table = heading.find_next('table')
                log.debug('checking first following table')
                regions_in_transition = self.parse_table(table)
                if not isInt(regions_in_transition):
                    qquit('UNKNOWN', 'parse error - ' +
                          'got non-integer \'{0}\' for regions in transition when parsing HMaster UI'\
                          .format(regions_in_transition))
        # returns the default 0 when the section heading was absent (see note above)
        return regions_in_transition
        #qquit('UNKNOWN', 'parse error - failed to find table data for regions in transition')
    except (AttributeError, TypeError):
        qquit(
            'UNKNOWN',
            'failed to parse HBase Master UI status page. ' + support_msg())
def process_rows(rows):
    """Find the longest region-in-transition time (ms) in HMaster UI table rows.

    Returns the max RIT time as an int, or None if no data rows were present.
    Quits UNKNOWN on unexpected table structure or non-integer times.
    """
    longest_rit_time = None
    # will skip header anyway when it doesn't find td (will contain th instead)
    # this will avoid accidentally skipping a row later if the input changes to rows[1:] instead of rows
    #for row in rows[1:]:
    for row in rows:
        # bugfix: removed stray debug print(row) which polluted plugin stdout
        cols = row.findChildren('td')
        # Regions in Transition rows only have 2 cols
        # <hex> region rows have Region, State, RIT time (ms)
        num_cols = len(cols)
        if num_cols == 0:
            # header row
            continue
        elif num_cols != 3:
            qquit('UNKNOWN', 'unexpected number of columns ({0}) '.format(num_cols) +
                  'for regions in transition table. ' + support_msg())
        if 'Regions in Transition' in cols[0].get_text():
            continue
        rit_time = cols[2].get_text().strip()
        if not isInt(rit_time):
            qquit('UNKNOWN', 'parsing failed, got region in transition time of ' +
                  "'{0}', expected integer".format(rit_time))
        rit_time = int(rit_time)
        # bugfix: 'rit_time > None' raises TypeError on Python 3 on the first data row
        if longest_rit_time is None or rit_time > longest_rit_time:
            longest_rit_time = rit_time
    return longest_rit_time
def __parse_verbose__(self):
    # Combine the -v option count, the $VERBOSE env var and --quiet/--debug into
    # the effective verbosity, then map that on to the logger level.
    self.verbose += int(self.get_opt('verbose'))
    env_verbose = os.getenv('VERBOSE')
    if isInt(env_verbose):
        # NOTE(review): env_verbose is still a string here - comparing it against an
        # int only works on Python 2; presumably int(env_verbose) was intended, as
        # used in the assignment below - confirm
        if env_verbose > self.verbose:
            log.debug(
                'environment variable $VERBOSE = %s, increasing verbosity',
                env_verbose)
            self.verbose = int(env_verbose)
    elif env_verbose is None:
        pass
    else:
        log.warning(
            "$VERBOSE environment variable is not an integer ('%s')",
            env_verbose)
    # quiet wins over everything else, otherwise escalate log level with verbosity
    if self.is_option_defined('quiet') and self.get_opt('quiet'):
        self.verbose = 0
    elif self.verbose > 2:
        log.setLevel(logging.DEBUG)
    elif self.verbose > 1:
        log.setLevel(logging.INFO)
    elif self.verbose > 0 and self._prog[0:6] != 'check_':
        # presumably programs named check_* keep quieter logging - confirm intent
        log.setLevel(logging.WARN)
    if self.options.debug:
        log.setLevel(logging.DEBUG)  # pragma: no cover
        log.debug('enabling debug logging')
        # debug mode implies at least verbosity 3
        if self.verbose < 3:
            self.verbose = 3
def parse_json(self, json_data):
    # Parse the NameNode JMX bean for % HDFS space used plus file/block counts,
    # checking the percentage against thresholds and emitting perfdata.
    log.info('parsing response')
    try:
        bean = json_data['beans'][0]
        space_used_pc = bean['PercentUsed']
        # the way below is more informative
        #assert type(space_used_pc) == float
        # tiny values come back in scientific notation (e.g. 1.2E-7) - treat as zero
        if re.search(r'e-\d+$', str(space_used_pc)):
            space_used_pc = 0
        if not isFloat(space_used_pc):
            raise UnknownError("non-float returned for PercentUsed by namenode '{0}:{1}'"\
                               .format(self.host, self.port))
        assert space_used_pc >= 0
        stats = {}
        for stat in ('Total', 'TotalBlocks', 'TotalFiles', 'Used'):
            stats[stat] = bean[stat]
            if not isInt(stats[stat]):
                raise UnknownError("non-integer returned for {0} by namenode '{1}:{2}'"\
                                   .format(stat, self.host, self.port))
            stats[stat] = int(stats[stat])
        self.ok()
        self.msg = 'HDFS space used = {0:.2f}% ({1}/{2})'\
                   .format(space_used_pc, humanize.naturalsize(stats['Used']), humanize.naturalsize(stats['Total']))
        self.check_thresholds(space_used_pc)
        self.msg += ", in {0:d} files spread across {1:d} blocks".format(stats['TotalFiles'], stats['TotalBlocks'])
        # perfdata section
        self.msg += " | 'HDFS % space used'={0:f}%{1}".format(space_used_pc, self.get_perf_thresholds())
        self.msg += " 'HDFS space used'={0:d}b".format(stats['Used'])
        self.msg += " 'HDFS file count'={0:d}".format(stats['TotalFiles'])
        self.msg += " 'HDFS block count'={0:d}".format(stats['TotalBlocks'])
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
    except ValueError as _:
        raise UnknownError("invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"\
                           .format(self.host, self.port, _))
def parse(self, req):
    """Scrape the worker's last heartbeat from the HTML status page.

    Verifies the column next to 'Node Name' really is 'Last Heartbeat' before
    reading it, then checks the heartbeat lag against thresholds.
    """
    page = BeautifulSoup(req.content, 'html.parser')
    last_heartbeat = None
    try:
        self.list_workers(page)
        name_header = page.find('th', text='Node Name')
        heartbeat_col_header = name_header.find_next_sibling().get_text()
        # make sure ordering of columns is as we expect so we're parsing the correct number for heartbeat lag
        if heartbeat_col_header != 'Last Heartbeat':
            code_error("heartbeat column header '{}' != Last Heartbeat".format(heartbeat_col_header))
        node_cell = page.find('th', text=self.node)
        last_heartbeat = node_cell.find_next_sibling().get_text()
        if last_heartbeat is None:
            raise AttributeError
    except (AttributeError, TypeError):
        raise CriticalError("{0} worker '{1}' not found among list of live workers!"\
                            .format(self.software, self.node))
    if not isInt(last_heartbeat):
        raise UnknownError("last heartbeat '{0}' for node '{1}' is not an integer, possible parsing error! {2}"\
                           .format(last_heartbeat, self.node, support_msg()))
    self.msg = "{0} worker '{1}' last heartbeat = {2} secs ago".format(self.software, self.node, last_heartbeat)
    self.check_thresholds(last_heartbeat)
    self.msg += ' | last_heartbeat={0}s{1}'.format(last_heartbeat, self.get_perf_thresholds())
def parse(self, content):
    # Scrape the regions-stuck-in-transition count from the HMaster UI status page HTML.
    # could also collect lines after 'Regions-in-transition' if parsing /dump
    # sample:
    # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
    # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
    soup = BeautifulSoup(content, 'html.parser')
    #if log.isEnabledFor(logging.DEBUG):
    #    log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
    # looks like HMaster UI doesn't print this section if there are no regions in transition, must assume zero
    regions_stuck_in_transition = 0
    try:
        headings = soup.findAll('h2')
        for heading in headings:
            log.debug("checking heading '%s'", heading)
            if heading.get_text() == "Regions in Transition":
                log.debug('found Regions in Transition section header')
                table = heading.find_next('table')
                log.debug('checking first following table')
                regions_stuck_in_transition = self.parse_table(table)
                if not isInt(regions_stuck_in_transition):
                    qquit('UNKNOWN', 'parse error - ' +
                          'got non-integer \'{0}\' for regions stuck in transition when parsing HMaster UI'\
                          .format(regions_stuck_in_transition))
        # returns the default 0 when the section heading was absent (see note above)
        return regions_stuck_in_transition
        #qquit('UNKNOWN', 'parse error - failed to find table data for regions stuck in transition')
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to parse HBase Master UI status page. ' + support_msg())
def run(self):
    # Fetch the HBase HMaster /master-status page and report regions stuck in transition.
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for beans
    # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
    # https://issues.apache.org/jira/browse/HBASE-16636
    #url = 'http://%(host)s:%(port)s/jmx' % locals()
    # could get info from flat txt debug page but it doesn't contain the summary count
    #url = 'http://%(host)s:%(port)s/dump' % locals()
    url = 'http://%(host)s:%(port)s/master-status' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
    regions_stuck_in_transition = self.parse(req.content)
    # parse() may return None on unexpected page structure - treat that as a parse error too
    if regions_stuck_in_transition is None:
        qquit('UNKNOWN', 'parse error - failed to find number for regions stuck in transition')
    if not isInt(regions_stuck_in_transition):
        qquit('UNKNOWN', 'parse error - got non-integer for regions stuck in transition when parsing HMaster UI')
    # any stuck region is critical - HBase itself already applied its transition-time threshold
    if regions_stuck_in_transition == 0:
        self.ok()
    else:
        self.critical()
    self.msg = '{0} regions stuck in transition (ie. transitioning longer than HBase threshold)'\
               .format(regions_stuck_in_transition)
    self.msg += " | regions_stuck_in_transition={0};0;0".format(regions_stuck_in_transition)
def run(self):
    """Query Attivio AIE's ingest session count REST endpoint and check thresholds.

    Sets self.msg with the count plus perfdata; quits CRITICAL on connection or
    HTTP errors and UNKNOWN on unparseable output.
    """
    url = '{protocol}://{host}:{port}/rest/ingestApi/getSessionCount'.format(host=self.host,
                                                                            port=self.port,
                                                                            protocol=self.protocol)
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        errhint = ''
        if 'BadStatusLine' in str(_.message):
            errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
        elif self.protocol == 'https' and 'unknown protocol' in str(_.message):
            errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
        qquit('CRITICAL', str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', '{0} {1}'.format(req.status_code, req.reason))
    try:
        count = req.content.strip()
        if not isInt(count):
            raise ValueError('non-integer value returned by Attivio AIE')
        count = int(count)
        self.msg = '{software} ingest session count = {count}'.format(software=self.software, count=count)
        self.check_thresholds(count)
    # bugfix: must bind the exception with 'as _' - the handler below references
    # '_' which was otherwise unbound here, raising NameError instead of reporting
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    self.msg += ' | ingest_session_count={0:d}{thresholds}'.format(count, thresholds=self.get_perf_thresholds())
def collapse_sql_fields(self, row):
    # Re-join SQL text fields that the CSV parser split on embedded commas,
    # collapsing the row back towards the expected header width.
    sql_index = self.indicies['sql_index']
    sql_index2 = self.indicies['sql_index2']
    object_index = self.indicies['object_index']
    len_row = len(row)
    if len_row > self.len_headers:
        log.debug('collapsing fields in row: %s', row)
        # divide by 2 to account for this having been done twice in duplicated SQL operational text
        # Update: appears this broke as only 2nd occurence of SQL operational text field got split to new fields,
        # which is weird because the log shows both 1st and 2nd SQL text fields were double quoted
        difference = len_row - self.len_headers
        # seems first occurrence doesn't get split in some occurence,
        # wasn't related to open in newline universal mode though
        # if 2 fields after isn't the /user/hive/warehouse/blah.db then 1st SQL wasn't split
        # would have to regex /user/hive/warehouse/blah.db(?:/table)?
        #if not row[sql_index+2].endswith('.db'):
        # if object field is TABLE or DATABASE then 1st sql field wasn't split
        if row[object_index] not in ('TABLE', 'DATABASE'):
            difference /= 2
        # slice indicies must be integers
        if not isInt(difference):
            raise AssertionError("difference in field length '{}' is not an integer for row: {}"\
                                 .format(difference, row))
        difference = int(difference)
        # NOTE(review): the slices below look suspicious - 'row[sql_index:difference]'
        # slices up to the absolute index 'difference' rather than
        # 'sql_index + difference', which only coincides when sql_index == 0;
        # confirm intent before changing, behavior preserved as-is here
        row[sql_index] = ','.join(
            [self.sql_decomment(_) for _ in row[sql_index:difference]])
        row = row[:sql_index] + row[sql_index + difference:]
        row[sql_index2] = ','.join(
            [self.sql_decomment(_) for _ in row[sql_index2:difference]])
        row = row[:sql_index2] + row[sql_index2 + difference:]
        log.debug('collapsed row: %s', row)
    else:
        log.debug('not collapsing row: %s', row)
    return row
def verbose_default(self, arg):
    """Set the default verbosity level; must be given an integer."""
    if isInt(arg):
        log.debug('setting default verbose to %s', arg)
        self.__verbose_default = int(arg)
    else:
        raise CodingError(
            'invalid verbose level passed to verbose_default(), must be an integer'
        )
def check_app_elapsed_times(self, app_list):
    """Check elapsed times of selected Yarn apps against thresholds.

    Returns (num_apps_breaching_sla, matching_apps, max_elapsed_secs,
    max_threshold_msg). self.msg is restored afterwards since check_thresholds
    appends to it per app.
    """
    num_apps_breaching_sla = 0
    max_elapsed = 0
    matching_apps = 0
    max_threshold_msg = ''
    # save msg as check_thresholds appends to it which we want to reset in this case
    msg = self.msg
    for app in app_list:
        if not self.app_selector(app):
            continue
        name = app['name']
        matching_apps += 1
        elapsed_time = app['elapsedTime']
        # bugfix: was a bare assert, which is stripped under python -O; raise
        # explicitly instead (consistent with the sibling implementation)
        if not isInt(elapsed_time):
            raise UnknownError('elapsed_time {} is not an integer!'.format(elapsed_time))
        # API returns millisecs
        elapsed_time = int(elapsed_time / 1000)
        threshold_msg = self.check_thresholds(elapsed_time)
        if threshold_msg:
            num_apps_breaching_sla += 1
            log.info("app '%s' is breaching SLA", name)
        if elapsed_time > max_elapsed:
            max_elapsed = elapsed_time
            max_threshold_msg = threshold_msg
    if max_threshold_msg:
        max_threshold_msg = ' ' + max_threshold_msg
    # restore msg prefix as check_thresholds appends every threshold breach
    self.msg = msg
    return (num_apps_breaching_sla, matching_apps, max_elapsed, max_threshold_msg)
def __parse_timeout__(self):
    # Resolve the effective timeout with precedence: --timeout option,
    # then $TIMEOUT env var, then the class default.
    # reset this to none otherwise unit tests fail to take setting from timeout_default
    # use __timeout to bypass the property setter checks
    self.__timeout = None
    if 'timeout' in dir(self.options):
        timeout = self.get_opt('timeout')
        if timeout is not None:
            log.debug('getting --timeout value %s', self.timeout)
            self.timeout = timeout
    if self.timeout is None:
        env_timeout = os.getenv('TIMEOUT')
        log.debug('getting $TIMEOUT value %s', env_timeout)
        if env_timeout is not None:
            log.debug('env_timeout is not None')
            if isInt(env_timeout):
                log.debug(
                    "environment variable $TIMEOUT = '%s' and timeout not already set, setting timeout = %s",
                    env_timeout, env_timeout)
                self.timeout = int(env_timeout)
            else:
                log.warning(
                    "$TIMEOUT environment variable is not an integer ('%s')",
                    env_timeout)
    if self.timeout is None:
        log.debug('timeout not set, using default timeout %s',
                  self.timeout_default)
        self.timeout = self.timeout_default
def check_app_elapsed_times(self, app_list):
    """Check elapsed times of selected Yarn apps against thresholds.

    Returns a 4-tuple of (apps breaching SLA, matching apps, longest elapsed
    secs, threshold message for the longest app). self.msg is restored on exit
    because check_thresholds appends a fragment per breaching app.
    """
    breaching_count = 0
    longest_elapsed = 0
    selected_count = 0
    longest_threshold_msg = ''
    # save msg as check_thresholds appends to it which we want to reset in this case
    saved_msg = self.msg
    for app in app_list:
        if not self.app_selector(app):
            continue
        name = app['name']
        selected_count += 1
        elapsed_time = app['elapsedTime']
        if not isInt(elapsed_time):
            raise UnknownError(
                'elapsed_time {} is not an integer!'.format(elapsed_time))
        elapsed_secs = int(elapsed_time / 1000)
        threshold_msg = self.check_thresholds(elapsed_secs)
        if threshold_msg:
            breaching_count += 1
            log.info("app '%s' is breaching SLA", name)
        if elapsed_secs > longest_elapsed:
            longest_elapsed = elapsed_secs
            longest_threshold_msg = threshold_msg
    if longest_threshold_msg:
        longest_threshold_msg = ' ' + longest_threshold_msg
    # restore msg prefix as check_thresholds appends every threshold breach
    self.msg = saved_msg
    return (breaching_count, selected_count, longest_elapsed, longest_threshold_msg)
def parse_builds(self, content):
    """Parse the Travis CI API build list and collect the builds of interest.

    Collects up to self.num builds: completed builds when self.completed is set,
    failed/errored builds when self.failed is set, otherwise the most recent
    builds. Returns the list of collected build dicts; qquit()s UNKNOWN when no
    builds are returned or none match.
    """
    log.debug('parsing build info')
    build = None
    collected_builds = []
    json_data = json.loads(content)
    if not json_data or \
       'builds' not in json_data or \
       not json_data['builds']:
        qquit('UNKNOWN', "no Travis CI builds returned by the Travis API." +
              " Either the specified repo '{0}' doesn't exist".format(self.repo) +
              " or no builds have happened yet?" +
              " Also remember the repo is case sensitive, for example 'harisekhon/nagios-plugins' returns this" +
              " blank build set whereas 'HariSekhon/nagios-plugins' succeeds" +
              " in returning latest builds information")
    builds = json_data['builds']
    # get latest finished failed build
    last_build_number = None
    found_newer_passing_build = False
    for _ in builds:
        # API returns most recent build first
        # extra check to make sure we're getting the very latest build number and API hasn't changed
        build_number = _['number']
        if not isInt(build_number):
            raise UnknownError('build number returned is not an integer!')
        build_number = int(build_number)
        if last_build_number is None:
            # seed one above the first build so the descending-order check below passes for it
            last_build_number = int(build_number) + 1
        if build_number >= last_build_number:
            raise UnknownError('build number returned is out of sequence, cannot be >= last build returned' + \
                               '{0}'.format(support_msg_api()))
        last_build_number = build_number
        if self.completed:
            if len(collected_builds) < self.num and _['state'] in ('passed', 'finished', 'failed', 'errored'):
                collected_builds.append(_)
        elif self.failed:
            if _['state'] == 'passed':
                # a newer passing build means the failure being debugged may already be fixed
                if not collected_builds and not found_newer_passing_build:
                    log.warning("found more recent successful build #%s with state = '%s'" + \
                                ", you may not need to debug this build any more", _['number'], _['state'])
                    found_newer_passing_build = True
            elif _['state'] in ('failed', 'errored'):
                if len(collected_builds) < self.num:
                    collected_builds.append(_)
                # by continuing to iterate through the rest of the builds we can check
                # their last_build numbers are descending for extra sanity checking
                #break
        elif len(collected_builds) < self.num:
            collected_builds.append(_)
            # by continuing to iterate through the rest of the builds we can check
            # their last_build numbers are descending for extra sanity checking
            #break
    if not collected_builds:
        qquit('UNKNOWN', 'no recent builds found')
    if log.isEnabledFor(logging.DEBUG):
        for build in collected_builds:
            log.debug("build:\n%s", jsonpp(build))
    return collected_builds
def print_results(self, term, limit=None):
    """Search DockerHub for 'term' and print results like the 'docker search' command.

    Results are ordered by star count descending. In quiet mode only repo names
    are printed; in verbose mode a shown/total footer is appended.
    """
    data = self.search(term, limit)
    results = {}
    longest_name = 8
    try:
        # collect in dict to order by stars like normal docker search command
        for item in data['results']:
            star = item['star_count']
            name = item['name']
            if len(name) > longest_name:
                longest_name = len(name)
            if not isInt(star):
                die("star count '{0}' for repo '{1}' is not an integer! {2}"
                    .format(star, name, support_msg_api()))
            # nested dict: star count -> repo name -> details
            results[star] = results.get(star, {})
            results[star][name] = results[star].get(name, {})
            result = {}
            result['description'] = item['description']
            result['official'] = '[OK]' if item['is_official'] else ''
            # docker search doesn't output this so neither will I
            #result['trusted'] = result['is_trusted']
            result['automated'] = '[OK]' if item['is_automated'] else ''
            results[star][name] = result
        # mimicking out spacing from 'docker search' command
        if not self.quiet:
            print('{0:{5}s} {1:45s} {2:7s} {3:8s} {4:10s}'.format(
                'NAME', 'DESCRIPTION', 'STARS', 'OFFICIAL', 'AUTOMATED', longest_name))
    except KeyError as _:
        die('failed to parse results fields from data returned by DockerHub ' +
            '(format may have changed?): {0}'.format(_))
    except IOError as _:
        # tolerate piping the output into e.g. 'head'
        if str(_) == '[Errno 32] Broken pipe':
            pass
        else:
            raise

    def truncate(mystr, length):
        # shorten to fit the fixed-width description column, appending '...'
        if len(mystr) > length:
            mystr = mystr[0:length - 3] + '...'
        return mystr
    for star in reversed(sorted(results)):
        for name in sorted(results[star]):
            if self.quiet:
                # NOTE(review): .encode() prints a b'...' repr on Python 3 - looks Python-2-era, confirm
                print(name.encode('utf-8'))
            else:
                desc = truncate(results[star][name]['description'], 45)
                print('{0:{5}s} {1:45s} {2:<7d} {3:8s} {4:10s}'.
                      format(name.encode('utf-8'),
                             desc.encode('utf-8'),
                             star,
                             results[star][name]['official'],
                             results[star][name]['automated'],
                             longest_name))
    if self.verbose and not self.quiet:
        try:
            print('\nResults Shown: {0}\nTotal Results: {1}'.format(
                len(data['results']), data['num_results']))
        except KeyError as _:
            die('failed to parse get total results count from data returned by DockerHub ' +
                '(format may have changed?): {0}'.format(_))
def __parse_args__(self):
    """Parse CLI options, handle help/version, apply $VERBOSE, then delegate to parse_args().

    Returns (options, args). Exits UNKNOWN on help/version or OptionParser errors.
    """
    try:
        (self.options, self.args) = self.__parser.parse_args()
    # I don't agree with zero exit code from OptionParser for help/usage,
    # and want UNKNOWN not CRITICAL(2) for switch mis-usage...
    except SystemExit:  # pragma: no cover
        sys.exit(ERRORS['UNKNOWN'])
    if self.options.help:  # pragma: no cover
        self.usage()
    if self.options.version:  # pragma: no cover
        print('%(version)s' % self.__dict__)
        sys.exit(ERRORS['UNKNOWN'])
    if 'timeout' in dir(self.options):
        self.timeout = self.get_opt('timeout')
    env_verbose = os.getenv('VERBOSE')
    if isInt(env_verbose):
        # bug fix: os.getenv returns a string - cast before comparing / assigning,
        # otherwise 'str > int' raises TypeError on Python 3 and self.verbose
        # would be set to a string
        env_verbose = int(env_verbose)
        if env_verbose > self.verbose:
            log.debug('environment variable $VERBOSE = %s, increasing verbosity', env_verbose)
            self.verbose = env_verbose
    elif env_verbose is None:
        pass
    else:
        # log.warning instead of the deprecated log.warn alias
        log.warning("$VERBOSE environment variable is not an integer ('%s')", env_verbose)
    self.parse_args()
    return self.options, self.args
def parse(json_data):
    """Extract compactionQueueLength from the RegionServer JMX beans.

    Returns the compaction queue length; qquit()s UNKNOWN when the mbean is
    missing, the value is non-integer, or expected keys are absent.
    """
    try:
        # it's already nicely layed out
        #if log.isEnabledFor(logging.DEBUG):
        #    log.debug('%s', jsonpp(json_data))
        for bean in json_data['beans']:
            if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server':
                if log.isEnabledFor(logging.DEBUG):
                    log.debug('found RegionServer section:')
                    log.debug('%s', jsonpp(bean))
                compaction_queue_size = bean['compactionQueueLength']
                if not isInt(compaction_queue_size):
                    qquit('UNKNOWN',
                          'non-integer returned for compactionQueueLength! ' + support_msg_api())
                return compaction_queue_size
    except KeyError as _:
        # bug fix: original did "_ + ': ...'" which raises TypeError
        # (cannot concatenate an exception object to str) - format it instead
        # NOTE(review): message says 'HBase Master' but this parses a RegionServer mbean - confirm wording
        qquit('UNKNOWN', '{0}: failed to parse HBase Master jmx info. '.format(_) + support_msg_api())
    qquit('UNKNOWN',
          'RegionServer mbean not found, double check this is pointing to an HBase RegionServer')
def parse_output(self, content):
    """Scrape the HBase Master UI 'baseStats' table for per-server region counts.

    Updates self.server_min_regions / self.server_max_regions with
    (server, num_regions) tuples. qquit()s UNKNOWN on any parse failure.
    """
    soup = BeautifulSoup(content, 'html.parser')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '=' * 80))
    # shorter to just catch NoneType attribute error when tag not found and returns None
    try:
        basestats = soup.find('div', {'id': 'tab_baseStats'})
        table = basestats.find('table')
        rows = table.findAll('tr')
        headers = rows[0].findAll('th')
        header_server = headers[0].get_text()
        header_regions = headers[3].get_text()
        wider_table = len(headers) > 4
        # HBase 1.1 in HDP 2.3: ServerName | Start time | Requests Per Second | Num. Regions
        # HBase 1.2 (Apache):   ServerName | Start time | Version | Requests per Second | Num. Regions
        if wider_table:
            header_regions = headers[4].get_text()
        if header_server != 'ServerName':
            qquit('UNKNOWN', "Table headers in Master UI have changed" +
                  " (got {0}, expected 'ServerName'). ".format(header_server) + support_msg())
        if header_regions != 'Num. Regions':
            qquit('UNKNOWN', "Table headers in Master UI have changed" +
                  " (got {0}, expected 'Num. Regions'). ".format(header_regions) + support_msg())
        log.debug('%-50s\tnum_regions', 'server')
        for row in rows[1:]:
            # this can be something like:
            # 21689588ba40,16201,1473775984259
            # so don't apply isHost() validation because it'll fail FQDN / IP address checks
            cols = row.findAll('td')
            server = cols[0].get_text()
            if self.total_regex.match(server):
                continue
            num_regions = cols[3].get_text()
            if wider_table:
                num_regions = cols[4].get_text()
            if not isInt(num_regions):
                # bug fix: second .format() used placeholder '{1}' with only one
                # argument, raising IndexError whenever this error path fired
                qquit('UNKNOWN',
                      "parsing error - got '{0}' for num regions".format(num_regions) +
                      " for server '{0}', was expecting integer.".format(server) +
                      " UI format must have changed" + support_msg())
            num_regions = int(num_regions)
            log.debug('%-50s\t%s', server, num_regions)
            if self.server_min_regions[1] is None or num_regions < self.server_min_regions[1]:
                self.server_min_regions = (server, num_regions)
            if self.server_max_regions[1] is None or num_regions > self.server_max_regions[1]:
                self.server_max_regions = (server, num_regions)
    except (AttributeError, TypeError, IndexError):
        qquit('UNKNOWN', 'failed to find parse output')
def parse_json(self, json_data):
    """Extract available SMS credits from the Pingdom API response and check thresholds."""
    credits_available = json_data['credits']['availablesms']
    if not isInt(credits_available):
        raise UnknownError('Pingdom API returned non-integer for availablesms field')
    self.msg = 'Pingdom SMS credits available: {}'.format(credits_available)
    self.check_thresholds(credits_available)
    # perfdata with lower-boundary thresholds
    self.msg += ' | sms_credits={}'.format(credits_available)
    self.msg += self.get_perf_thresholds(boundary='lower')
def parse_json(self, json_data):
    """Validate the Jenkins executor count and append the message + perfdata."""
    executors = json_data['numExecutors']
    if not isInt(executors):
        raise UnknownError('non-integer returned by Jenkins. {0}'.format(support_msg_api()))
    self.msg += '{:d}'.format(executors)
    self.check_thresholds(executors)
    # perfdata with lower-boundary thresholds
    self.msg += ' | num_executors={0:d}'.format(executors)
    self.msg += self.get_perf_thresholds(boundary='lower')
def parse_json(self, json_data):
    """Handle Jenkins job JSON: list jobs, redirect to the last completed build,
    or check a specific build's status / duration / age against thresholds."""
    if self.list_jobs:
        print('Jenkins Jobs:\n')
        for job in json_data['jobs']:
            print(job['name'])
        sys.exit(ERRORS['UNKNOWN'])
    if 'lastCompletedBuild' in json_data:
        last_completed_build = json_data['lastCompletedBuild']
        if not last_completed_build:
            raise WarningError("job '{job}' not built yet".format(job=self.job))
        # re-query the specific build's endpoint and re-enter via process_json
        self.path = '/job/{job}/{number}/api/json'.format(job=self.job,
                                                          number=last_completed_build['number'])
        req = self.query()
        self.process_json(req.content)
        return
    displayname = json_data['displayName']
    duration = json_data['duration']
    if not isInt(duration):
        raise UnknownError('duration field returned non-integer! {0}'.format(support_msg_api()))
    # millis -> seconds; true division, so this is a float on Python 3
    duration = int(duration) / 1000
    result = json_data['result']
    timestamp = json_data['timestamp']
    if not isInt(timestamp):
        raise UnknownError('timestamp field returned non-integer! {0}'.format(support_msg_api()))
    timestamp = int(timestamp)
    building = json_data['building']
    self.msg += "build {build} status: ".format(build=displayname)
    if building:
        # can't judge a build still in progress
        self.unknown()
        self.msg += 'STILL BUILDING!'
        return
    self.msg += result
    if result != 'SUCCESS':
        self.critical()
    self.msg += ', duration={duration} secs'.format(duration=duration)
    self.check_thresholds(duration)
    # timestamp is epoch millis - age is seconds since the build started
    age = time.time() - (timestamp/1000)
    self.msg += ', age={age} secs'.format(age=sec2human(age))
    if age < 0:
        # negative age implies clock skew between us and the Jenkins server
        self.warning()
        self.msg += ' (< 0!)'
    if self.age and age > self.age:
        self.critical()
        self.msg += ' (> {0:d})'.format(self.age)
    self.msg += ' | build_duration={duration}s{perf_thresholds}'.format(duration=duration, \
                perf_thresholds=self.get_perf_thresholds())
def mac_getent_passwd_user(self, user):
    """Emulate 'getent passwd <user>' on Mac via 'dscl . -read /Users/<user>'.

    Returns (getent_record, returncode) where getent_record is in the
    user:password:uid:gid:name:homedir:shell format, or ('', returncode)
    when the user was not found.
    """
    log.info('mac_getent_passwd_user(%s)', user)
    command = 'dscl . -read /Users/{user}'.format(user=user)
    (output, returncode) = self.cmd(command)
    user = password = uid = gid = name = homedir = shell = ''
    #log.info('parsing output for passwd conversion')
    output = output.split('\n')
    for (index, line) in enumerate(output):
        tokens = line.split()
        if len(tokens) < 1:
            continue
        field = tokens[0]
        if len(tokens) < 2:
            value = ''
        else:
            value = tokens[1]
        if field == 'RecordName:':
            user = value
        elif field == 'Password:':
            # bug fix: this branch was corrupted by a redaction artifact;
            # mirror the traditional shadowed passwd field with a placeholder
            password = 'x'
        elif field == 'UniqueID:':
            uid = value
        elif field == 'PrimaryGroupID:':
            gid = value
        elif field == 'RealName:':
            name = value
            # RealName's value may be on the following indented continuation line
            if not value and len(output) > index + 1 and output[index+1].startswith(' '):
                name = output[index+1].strip()
        # NOTE(review): unreachable - 'RecordName:' is consumed by the first branch above
        elif not name and field == 'RecordName:':
            name = value
        elif field == 'NFSHomeDirectory:':
            homedir = value
        elif field == 'UserShell:':
            shell = value
    if not user:
        return ('', returncode)
    # bug fix: record template was corrupted ('hotexamples_com' placeholder) - restore the homedir field
    getent_record = '{user}:{password}:{uid}:{gid}:{name}:{homedir}:{shell}'.format\
                    (user=user, password=password, uid=uid, gid=gid, name=name, homedir=homedir, shell=shell)
    if not isInt(uid, allow_negative=True):
        die("parsing error: UID '{uid}' is not numeric in record {record}!".format(uid=uid, record=getent_record))
    if not isInt(gid, allow_negative=True):
        die("parsing error: GID '{gid}' is not numeric in record {record}!".format(gid=gid, record=getent_record))
    return (getent_record, returncode)
def check_ping(host, count=None, wait=None):
    """Ping host; return host if it responds, else None.

    count defaults to 1, wait to 3 secs. Raises UnknownError on non-integer
    count/wait; dies if the ping binary cannot be executed.
    """
    if count is None:
        count = 1
    if wait is None:
        wait = 3
    if not isInt(count):
        raise UnknownError("passed invalid count '{0}' to check_ping method, must be a valid integer!"\
                           .format(count))
    if not isInt(wait):
        raise UnknownError("passed invalid wait '{0}' to check_ping method, must be a valid integer!"\
                           .format(wait))
    log.info("pinging host '%s' (count=%s, wait=%s)", host, count, wait)
    count_switch = '-c'
    if platform.system().lower() == 'windows':
        count_switch = '-n'
    wait_switch = '-w'
    if platform.system().lower() == 'darwin':
        wait_switch = '-W'
    # causes hang if count / wait are not cast to string
    cmd = ['ping', count_switch, '{0}'.format(count), wait_switch, '{0}'.format(wait), host]
    log.debug('cmd: %s', ' '.join(cmd))
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = process.communicate()
        # communicate() already waits for termination and sets returncode -
        # the original redundant process.wait() is dropped
        exitcode = process.returncode
        log.debug('stdout: %s', stdout)
        log.debug('stderr: %s', stderr)
        log.debug('exitcode: %s', exitcode)
        if exitcode == 0:
            log.info("host '%s' responded to ping", host)
            return host
    # NOTE(review): Popen/communicate never raise CalledProcessError (only check_call /
    # check_output do) - handler kept defensively but is effectively dead code
    except subprocess.CalledProcessError as _:
        log.warning('ping failed: %s', _.output)
    except OSError as _:
        die('error calling ping: {0}'.format(_))
    return None
def process_result(self, result):
    """Evaluate the latest DockerHub build result: status, tag, trigger and latency.

    Sets critical unless the mapped status is 'Success' and builds self.msg
    including perfdata for the creation-to-last-updated build latency.
    """
    _id = result['id']
    log.info('latest build id: %s', _id)
    status = result['status']
    log.info('status: %s', status)
    if not isInt(status, allow_negative=True):
        raise UnknownError(
            'non-integer status returned by DockerHub API. {0}'.format(
                support_msg_api()))
    tag = result['dockertag_name']
    log.info('tag: %s', tag)
    trigger = result['cause']
    log.info('trigger: %s', trigger)
    created_date = result['created_date']
    log.info('created date: %s', created_date)
    last_updated = result['last_updated']
    log.info('last updated: %s', last_updated)
    # strip fractional seconds before parsing the ISO-8601-style timestamps
    created_datetime = datetime.datetime.strptime(
        created_date.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    updated_datetime = datetime.datetime.strptime(
        last_updated.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    build_latency_timedelta = updated_datetime - created_datetime
    build_latency = build_latency_timedelta.total_seconds()
    log.info('build latency (creation to last updated): %s', build_latency)
    # results in .0 floats anyway
    build_latency = int(build_latency)
    build_code = result['build_code']
    build_url = 'https://hub.docker.com/r/{0}/builds/{1}'.format(
        self.repo, build_code)
    log.info('latest build URL: %s', build_url)
    # map the numeric status code to a human-readable name where known
    if str(status) in self.statuses:
        status = self.statuses[str(status)]
    else:
        log.warning("status code '%s' not recognized! %s", status, support_msg_api())
        log.warning('defaulting to assume status is an Error')
        status = 'Error'
    if status != 'Success':
        self.critical()
    self.msg += "'{repo}' last completed build status: '{status}', tag: '{tag}', build code: {build_code}"\
                .format(repo=self.repo, status=status, tag=tag, build_code=build_code)
    if self.verbose:
        self.msg += ', id: {0}'.format(_id)
        self.msg += ', trigger: {0}'.format(trigger)
        self.msg += ', created date: {0}'.format(created_date)
        self.msg += ', last updated: {0}'.format(last_updated)
        self.msg += ', build_latency: {0}'.format(sec2human(build_latency))
        self.msg += ', build URL: {0}'.format(build_url)
    self.msg += ' | build_latency={0:d}s'.format(build_latency)
def parse_json(self, json_data):
    """Check the statistics db event queue size from the management API response."""
    queue_size = json_data['statistics_db_event_queue']
    if not isInt(queue_size):
        raise UnknownError("non-integer stats db event queue returned ('{0}'). {1}"\
                           .format(queue_size, support_msg_api()))
    queue_size = int(queue_size)
    self.msg = "{0} stats dbs event queue = {1}".format(self.name, queue_size)
    self.check_thresholds(queue_size)
    # perfdata
    self.msg += " | stats_db_event_queue={0}".format(queue_size)
    self.msg += self.get_perf_thresholds()
def timeout_default(self, secs):
    """Set the default timeout in seconds (None allowed); must be an integer <= timeout_max."""
    if secs is not None:
        if not isInt(secs):
            raise CodingError('invalid timeout passed to timeout_default = , must be an integer representing seconds') # pylint: disable=line-too-long
        if self.timeout_max is not None and secs > self.timeout_max:
            raise CodingError('set default timeout > timeout max')
        secs = int(secs)
    log.debug('setting default timeout to %s secs', secs)
    # bypass the property setter via the mangled private attribute
    self.__timeout_default = secs
def __init__(self):
    """Initialize CLI base: version strings, prog name, usage messages and the OptionParser."""
    # instance attributes, feels safer
    self.name = None
    self.options = None
    self.args = None
    self.__verbose = None
    self.__verbose_default = 0
    self.__timeout = None
    self.__timeout_default = 10
    self.__timeout_max = 86400
    self.__total_run_time = time.time()
    self.topfile = get_topfile()
    self._docstring = get_file_docstring(self.topfile)
    if self._docstring:
        self._docstring = '\n' + self._docstring.strip() + '\n'
    if self._docstring is None:
        self._docstring = ''
    self._topfile_version = get_file_version(self.topfile)
    # this doesn't work in unit tests
    # if self._topfile_version:
    #     raise CodingError('failed to get topfile version - did you set a __version__ in top cli program?')  # pylint: disable=line-too-long
    self._cli_version = self.__version__
    self._utils_version = harisekhon.utils.__version__
    # returns 'python -m unittest' :-/
    # prog = os.path.basename(sys.argv[0])
    self._prog = os.path.basename(self.topfile)
    self._github_repo = get_file_github_repo(self.topfile)
    # _hidden attributes are shown in __dict__
    self.version = '{prog} version {topfile_version} '.format(prog=self._prog,
                                                              topfile_version=self._topfile_version) + \
                   '=> CLI version {cli_version} '.format(cli_version=self._cli_version) + \
                   '=> Utils version {utils_version}'.format(utils_version=self._utils_version)
    self.usagemsg = 'Hari Sekhon{sep}{github_repo}\n\n{prog}\n{docstring}\n'.format(\
        sep=' - ' if self._github_repo else '',
        github_repo=self._github_repo,
        prog=self._prog,
        docstring=self._docstring)
    self.usagemsg_short = 'Hari Sekhon%(_github_repo)s\n\n' % self.__dict__
    # set this in simpler client programs when you don't want to exclude
    # self.__parser = OptionParser(usage=self.usagemsg_short, version=self.version)
    # self.__parser = OptionParser(version=self.version)
    # will be added by default_opts later so that it's not annoyingly at the top of the option help
    # also this allows us to print full docstring for a complete description and not just the cli switches
    # description=self._docstring
    # don't want description printed for option errors
    width = os.getenv('COLUMNS', None)
    # bug fix: $COLUMNS comes back as a string - cast it, otherwise min(str, 200)
    # below raises TypeError on Python 3
    if isInt(width) and int(width) > 0:
        width = int(width)
    else:
        try:
            width = Terminal().width
        except _curses.error:
            width = 80
    width = min(width, 200)
    self.__parser = OptionParser(add_help_option=False,
                                 formatter=IndentedHelpFormatter(width=width))
    # duplicate key error or duplicate options, sucks
    # self.__parser.add_option('-V', dest='version', help='Show version and exit', action='store_true')
    self.setup()
def parse_json(self, json_data):
    """Check the worst (max) last GC collection time reported by Nifi system diagnostics."""
    garbage_collectors = json_data['systemDiagnostics']['aggregateSnapshot']['garbageCollection']
    worst_millis = max(gc['collectionMillis'] for gc in garbage_collectors)
    if not isInt(worst_millis):
        raise CriticalError('collectionMillis \'{}\' is not an integer!!'.format(worst_millis))
    worst_millis = int(worst_millis)
    gc_secs = '{:.2f}'.format(worst_millis / 1000)
    self.ok()
    self.msg = 'Nifi Java GC last collection time = {} secs'.format(gc_secs)
    self.check_thresholds(gc_secs)
    self.msg += ' | gc_collection={}s{}'.format(gc_secs, self.get_perf_thresholds())
def parse_json(self, json_data):
    """Extract and threshold-check the statistics db event queue size."""
    raw_queue = json_data['statistics_db_event_queue']
    if not isInt(raw_queue):
        raise UnknownError("non-integer stats db event queue returned ('{0}'). {1}"\
                           .format(raw_queue, support_msg_api()))
    queue_len = int(raw_queue)
    self.msg = "{0} stats dbs event queue = {1}".format(self.name, queue_len)
    self.check_thresholds(queue_len)
    self.msg += " | stats_db_event_queue={0}".format(queue_len)
    self.msg += self.get_perf_thresholds()
def parse_json(self, json_data):
    """Flag critical when Nifi reports no available processors."""
    available = json_data['systemDiagnostics']['aggregateSnapshot']['availableProcessors']
    if not isInt(available):
        raise CriticalError('availableProcessors \'{}\' is not an integer!!'.format(available))
    if int(available) > 0:
        self.ok()
        self.msg = 'Nifi status = OK, processors available'
    else:
        self.critical()
        self.msg = 'Nifi status = CRITICAL, no processors available'
def check_app(self, app):
    """Check one Yarn application: state, user, queue, containers and elapsed time.

    Returns the elapsed time in seconds; escalates to critical when the app
    is not RUNNING or when user / queue / container expectations are not met.
    """
    state = app['state']
    user = app['user']
    queue = app['queue']
    # Hadoop 2.2 doesn't have this field
    running_containers = None
    if 'runningContainers' in app:
        running_containers = app['runningContainers']
        if not isInt(running_containers, allow_negative=True):
            raise UnknownError('running_containers {} is not an integer!'.format(running_containers))
        running_containers = int(running_containers)
    elapsed_time = app['elapsedTime']
    if not isInt(elapsed_time):
        raise UnknownError('elapsed time {} is not an integer'.format(elapsed_time))
    # millis -> seconds
    elapsed_time = int(elapsed_time / 1000)
    self.msg = "Yarn application '{0}' state = '{1}'".format(app['name'], state)
    if state != 'RUNNING':
        self.critical()
    ##################
    # This shouldn't be used any more now using more targeted query to only return running apps
    # state = FAILED / KILLED also gets final status = FAILED KILLED, no point double printing
    if state == 'FINISHED':
        self.msg += ", final status = '{0}'".format(app['finalStatus'])
    ##################
    self.msg += ", user = '******'".format(user)
    if self.app_user is not None and self.app_user != user:
        self.critical()
        self.msg += " (expected '{0}')".format(self.app_user)
    self.msg += ", queue = '{0}'".format(queue)
    if self.queue is not None and self.queue != queue:
        self.critical()
        self.msg += " (expected '{0}')".format(self.queue)
    if running_containers is not None:
        self.msg += ", running containers = {0}".format(running_containers)
        if self.min_containers is not None and running_containers < self.min_containers:
            self.critical()
            self.msg += " (< '{0}')".format(self.min_containers)
    self.msg += ", elapsed time = {0} secs".format(elapsed_time)
    self.check_thresholds(elapsed_time)
    return elapsed_time
def parse_json(self, json_data):
    """Check HDFS used-space imbalance across live datanodes from NameNode JMX LiveNodes.

    Computes (max - min) used space as a percentage of max used space and
    checks it against thresholds, listing imbalanced nodes in verbose mode.
    """
    log.info('parsing response')
    try:
        live_nodes = json_data['beans'][0]['LiveNodes']
        live_node_data = json.loads(live_nodes)
        num_datanodes = len(live_node_data)
        if num_datanodes < 1:
            raise CriticalError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\
                                .format(self.host, self.port))
        min_space = None
        max_space = 0
        for datanode in live_node_data:
            used_space = live_node_data[datanode]['usedSpace']
            if not isInt(used_space):
                raise UnknownError('usedSpace {} is not an integer! {}'.format(used_space, support_msg_api()))
            used_space = int(used_space)
            log.info("datanode '%s' used space = %s", datanode, used_space)
            if min_space is None or used_space < min_space:
                min_space = used_space
            if used_space > max_space:
                max_space = used_space
        divisor = max_space
        if divisor < 1:
            # bug fix: message said 'min used space' but the divisor is max_space
            log.info('max used space < 1, resetting divisor to 1 (% will likely be very high)')
            divisor = 1
        if max_space < min_space:
            raise UnknownError('max_space < min_space')
        largest_imbalance_pc = float('{0:.2f}'.format(((max_space - min_space) / divisor) * 100))
        if largest_imbalance_pc < 0:
            raise UnknownError('largest_imbalance_pc < 0')
        self.ok()
        self.msg = '{0}% HDFS imbalance on space used'.format(largest_imbalance_pc)
        self.check_thresholds(largest_imbalance_pc)
        self.msg += ' across {0:d} datanode{1}'.format(num_datanodes, plural(num_datanodes))
        if self.verbose:
            self.msg += ', min used space = {0}, max used space = {1}'.format(min_space, max_space)
        if self.verbose and (self.is_warning() or self.is_critical()):
            self.msg += ' [imbalanced nodes: '
            for datanode in live_node_data:
                used_space = live_node_data[datanode]['usedSpace']
                used_pc = used_space / max_space * 100
                if used_pc > self.thresholds['warning']['upper']:
                    # bug fix: original used invalid format spec '{1:.2f%}' which raises
                    # ValueError; show each node's used space as a percentage of max
                    self.msg += '{0}({1:.2f}%),'.format(datanode, used_pc)
            self.msg = self.msg.rstrip(',') + ']'
        self.msg += " | 'HDFS imbalance on space used %'={0}".format(largest_imbalance_pc)
        self.msg += self.get_perf_thresholds()
        self.msg += " num_datanodes={0}".format(num_datanodes)
        self.msg += " min_used_space={0}".format(min_space)
        self.msg += " max_used_space={0}".format(max_space)
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
def process_result(self, result):
    """Evaluate the latest DockerHub build result: status, tag, trigger and latency.

    Marks critical unless the mapped status is 'Success'; appends perfdata for
    the creation-to-last-updated build latency.
    """
    _id = result['id']
    log.info('latest build id: %s', _id)
    status = result['status']
    log.info('status: %s', status)
    if not isInt(status, allow_negative=True):
        raise UnknownError('non-integer status returned by DockerHub API. {0}'.format(support_msg_api()))
    tag = result['dockertag_name']
    log.info('tag: %s', tag)
    trigger = result['cause']
    log.info('trigger: %s', trigger)
    created_date = result['created_date']
    log.info('created date: %s', created_date)
    last_updated = result['last_updated']
    log.info('last updated: %s', last_updated)
    # strip fractional seconds before parsing the ISO-8601-style timestamps
    created_datetime = datetime.datetime.strptime(created_date.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    updated_datetime = datetime.datetime.strptime(last_updated.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    build_latency_timedelta = updated_datetime - created_datetime
    build_latency = build_latency_timedelta.total_seconds()
    log.info('build latency (creation to last updated): %s', build_latency)
    # results in .0 floats anyway
    build_latency = int(build_latency)
    build_code = result['build_code']
    build_url = 'https://hub.docker.com/r/{0}/builds/{1}'.format(self.repo, build_code)
    log.info('latest build URL: %s', build_url)
    # map the numeric status code to a human-readable name where known
    if str(status) in self.statuses:
        status = self.statuses[str(status)]
    else:
        log.warning("status code '%s' not recognized! %s", status, support_msg_api())
        log.warning('defaulting to assume status is an Error')
        status = 'Error'
    if status != 'Success':
        self.critical()
    self.msg += "'{repo}' last completed build status: '{status}', tag: '{tag}', build code: {build_code}"\
                .format(repo=self.repo, status=status, tag=tag, build_code=build_code)
    if self.verbose:
        self.msg += ', id: {0}'.format(_id)
        self.msg += ', trigger: {0}'.format(trigger)
        self.msg += ', created date: {0}'.format(created_date)
        self.msg += ', last updated: {0}'.format(last_updated)
        self.msg += ', build_latency: {0}'.format(sec2human(build_latency))
        self.msg += ', build URL: {0}'.format(build_url)
    self.msg += ' | build_latency={0:d}s'.format(build_latency)
def get_request_ids(self):
    """Return the list of Ambari request ids for self.cluster; dies on parse failure."""
    content = self.get('/clusters/{cluster}/requests'.format(cluster=self.cluster))
    try:
        json_data = json.loads(content)
        request_ids = []
        for item in json_data['items']:
            # guard clause instead of nested if: skip other clusters
            if item['Requests']['cluster_name'] != self.cluster:
                continue
            request_id = item['Requests']['id']
            if not isInt(request_id):
                die('request id returned was not an integer! ' + support_msg_api())
            request_ids.append(request_id)
        return request_ids
    except (KeyError, ValueError) as _:
        die('failed to parse response for request IDs: {0}. '.format(_) + support_msg_api())
def parse_json(self, json_data):
    """Check the last contact time of a datanode with the NameNode.

    Searches the live, decommissioning and dead node lists (escalating to
    warning / critical for the latter two) and checks last contact seconds
    against thresholds.
    """
    log.info('parsing response')
    try:
        live_nodes_str = json_data['beans'][0]['LiveNodes']
        dead_nodes_str = json_data['beans'][0]['DeadNodes']
        decom_nodes_str = json_data['beans'][0]['DecomNodes']
        # these JMX fields are JSON strings embedded in the JSON - parse again
        live_nodes = json.loads(live_nodes_str)
        dead_nodes = json.loads(dead_nodes_str)
        decom_nodes = json.loads(decom_nodes_str)
        self.print_nodes(live_nodes=live_nodes,
                         dead_nodes=dead_nodes,
                         decom_nodes=decom_nodes)
        last_contact_secs = None
        for item in live_nodes:
            if self.match_datanode(self.datanode, item):
                last_contact_secs = live_nodes[item]['lastContact']
        # always check decom and dead nodes regardless if last_contact_secs was found in live nodes
        # gives an additional safety check to escalate to warning / critical
        self.msg = ''
        for item in decom_nodes:
            if self.match_datanode(self.datanode, item):
                last_contact_secs = decom_nodes[item]['lastContact']
                self.warning()
                self.msg = 'Decommissioning '
        for item in dead_nodes:
            if self.match_datanode(self.datanode, item):
                last_contact_secs = dead_nodes[item]['lastContact']
                self.critical()
                self.msg = 'Dead '
        if last_contact_secs is None:
            raise UnknownError("datanode '{0}' is not present in any of the live, ".format(self.datanode) + \
                               "decommissioning or dead node lists!")
        if not isInt(last_contact_secs):
            raise UnknownError("non-integer '{0}' returned for last contact seconds by namenode '{1}:{2}'"\
                               .format(last_contact_secs, self.host, self.port))
        last_contact_secs = int(last_contact_secs)
        if last_contact_secs < 0:
            raise UnknownError('last_contact_secs {} < 0!'.format(last_contact_secs))
        self.msg += "HDFS datanode '{0}' last contact with namenode was {1} sec{2} ago"\
                    .format(self.datanode, last_contact_secs, plural(last_contact_secs))
        self.check_thresholds(last_contact_secs)
        self.msg += ' | datanode_last_contact_secs={0}'.format(last_contact_secs)
        self.msg += self.get_perf_thresholds()
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
    except ValueError as _:
        raise UnknownError("invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"\
                           .format(self.host, self.port, _))
def __init__(self):
    """Initialize HeadTail defaults; derive lines-per-section from $LINES when exported."""
    # Python 2.x
    super(HeadTail, self).__init__()
    # Python 3.x
    # super().__init__()
    self.default_num_lines = 10
    # this is usually None unless you explicitly 'export LINES'
    lines_env = os.getenv('LINES')
    if lines_env and isInt(lines_env):
        # roughly half the terminal each for head and tail, minus the separator line
        self.default_num_lines = int(int(lines_env) / 2) - 1
    self.num_lines = self.default_num_lines
    #self.sep = '...'
    self.sep = '-' * 80
    self.docsep = '=' * 80
    self.quiet = False
def parse_json(self, json_data):
    """Check HDFS total block count from NameNode JMX against thresholds."""
    log.info('parsing response')
    try:
        bean = json_data['beans'][0]
        total_blocks = bean['TotalBlocks']
        if not isInt(total_blocks):
            raise UnknownError('non-integer returned by NameNode for number of total blocks! {0}'\
                               .format(support_msg_api()))
        total_blocks = int(total_blocks)
        self.msg = 'HDFS Total Blocks = {0:d}'.format(total_blocks)
        self.check_thresholds(total_blocks)
        self.msg += ' | hdfs_total_blocks={0:d}{1}'.format(total_blocks, self.get_perf_thresholds())
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
def parse_output(self, content):
    """Scrape the HBase Master UI 'baseStats' table for per-server region counts.

    Updates self.server_min_regions / self.server_max_regions with
    (server, num_regions) tuples. qquit()s UNKNOWN on any parse failure.
    """
    soup = BeautifulSoup(content, 'html.parser')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '='*80))
    # shorter to just catch NoneType attribute error when tag not found and returns None
    try:
        basestats = soup.find('div', {'id': 'tab_baseStats'})
        table = basestats.find('table')
        rows = table.findAll('tr')
        headers = rows[0].findAll('th')
        header_server = headers[0].get_text()
        header_regions = headers[3].get_text()
        wider_table = len(headers) > 4
        # HBase 1.1 in HDP 2.3: ServerName | Start time | Requests Per Second | Num. Regions
        # HBase 1.2 (Apache):   ServerName | Start time | Version | Requests per Second | Num. Regions
        if wider_table:
            header_regions = headers[4].get_text()
        if header_server != 'ServerName':
            qquit('UNKNOWN', "Table headers in Master UI have changed" +
                  " (got {0}, expected 'ServerName'). ".format(header_server) + support_msg())
        if header_regions != 'Num. Regions':
            qquit('UNKNOWN', "Table headers in Master UI have changed" +
                  " (got {0}, expected 'Num. Regions'). ".format(header_regions) + support_msg())
        log.debug('%-50s\tnum_regions', 'server')
        for row in rows[1:]:
            # this can be something like:
            # 21689588ba40,16201,1473775984259
            # so don't apply isHost() validation because it'll fail FQDN / IP address checks
            cols = row.findAll('td')
            server = cols[0].get_text()
            if self.total_regex.match(server):
                continue
            num_regions = cols[3].get_text()
            if wider_table:
                num_regions = cols[4].get_text()
            if not isInt(num_regions):
                # bug fix: second .format() used placeholder '{1}' with only one
                # argument, raising IndexError whenever this error path fired
                qquit('UNKNOWN',
                      "parsing error - got '{0}' for num regions".format(num_regions) +
                      " for server '{0}', was expecting integer.".format(server) +
                      " UI format must have changed" + support_msg())
            num_regions = int(num_regions)
            log.debug('%-50s\t%s', server, num_regions)
            if self.server_min_regions[1] is None or num_regions < self.server_min_regions[1]:
                self.server_min_regions = (server, num_regions)
            if self.server_max_regions[1] is None or num_regions > self.server_max_regions[1]:
                self.server_max_regions = (server, num_regions)
    except (AttributeError, TypeError, IndexError):
        qquit('UNKNOWN', 'failed to find parse output')
def parse_json(self, json_data): gc_times = [] for bean in json_data['beans']: if 'name' in bean and bean['name'][:37] == 'java.lang:type=GarbageCollector,name=': last_gc_info = bean['LastGcInfo'] if last_gc_info and 'duration' in last_gc_info and isInt(last_gc_info['duration']): gc_times.append(int(last_gc_info['duration'])) if not gc_times: raise UnknownError('no Java GC times found') gc_millis = max(gc_times) gc_millis = int(gc_millis) gc_secs = '{:.2f}'.format(gc_millis / 1000) self.ok() self.msg = '{} Java GC last duration = {} secs'.format(self.name[0], gc_secs) self.check_thresholds(gc_secs) self.msg += ' | gc_duration={}s{}'.format(gc_secs, self.get_perf_thresholds())
def parse_json(self, json_data): log.info('parsing response') try: live_nodes = json_data['beans'][0]['LiveNodes'] live_node_data = json.loads(live_nodes) num_datanodes = len(live_node_data) if num_datanodes < 1: raise CriticalError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\ .format(self.host, self.port)) max_blocks = 0 min_blocks = None for datanode in live_node_data: blocks = live_node_data[datanode]['numBlocks'] if not isInt(blocks): raise UnknownError('numBlocks {} is not an integer! {}'.format(blocks, support_msg_api())) blocks = int(blocks) log.info("datanode '%s' has %s blocks", datanode, blocks) if blocks > max_blocks: max_blocks = blocks if min_blocks is None or blocks < min_blocks: min_blocks = blocks log.info("max blocks on a single datanode = %s", max_blocks) log.info("min blocks on a single datanode = %s", min_blocks) if min_blocks is None: raise UnknownError('min_blocks is None') divisor = min_blocks if min_blocks < 1: log.info("min blocks < 1, resetting divisor to 1 (% will be very high)") divisor = 1 block_imbalance = float("{0:.2f}".format((max_blocks - min_blocks) / divisor * 100)) self.msg = '{0}% block imbalance across {1} datanode{2}'\ .format(block_imbalance, num_datanodes, plural(num_datanodes)) self.ok() self.check_thresholds(block_imbalance) if self.verbose: self.msg += ' (min blocks = {0}, max blocks = {1})'.format(min_blocks, max_blocks) self.msg += " | block_imbalance={0}%".format(block_imbalance) self.msg += self.get_perf_thresholds() self.msg += " num_datanodes={0}".format(num_datanodes) self.msg += " min_blocks={0}".format(min_blocks) self.msg += " max_blocks={0}".format(max_blocks) except KeyError as _: raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\ .format(self.host, self.port, _, support_msg_api())) except ValueError as _: raise UnknownError("invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"\ .format(self.host, self.port, _))
def run(self): server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port) try: log.debug('setting up Jenkins connection to %s', server_url) start_time = time.time() server = jenkins.Jenkins(server_url, username=self.user, password=self.password, timeout=self.timeout / 3) if log.isEnabledFor(logging.DEBUG): log.debug('getting user') user = server.get_whoami() log.debug('connected as user %s', jsonpp(user)) if self.list_nodes: log.debug('getting Jenkins nodes') nodes = server.get_nodes() log.debug('nodes: %s', nodes) print('Jenkins nodes:\n') for _ in nodes: print(_['name']) sys.exit(ERRORS['UNKNOWN']) # doesn't find 'master' node despite showing it in the list of nodes, jenkins puts brackets around master if self.node == 'master': self.node = '(master)' node = server.get_node_info(self.node) except jenkins.NotFoundException: raise CriticalError("node '{0}' not found, did you specify the correct name? See --list to see nodes"\ .format(self.node)) except jenkins.JenkinsException as _: raise CriticalError(_) query_time = time.time() - start_time if log.isEnabledFor(logging.DEBUG): log.debug('%s', jsonpp(node)) offline = node['offline'] offline_reason = node['offlineCauseReason'] num_executors = node['numExecutors'] num_executors = int(num_executors) if not isInt(num_executors): raise UnknownError('numExecutors returned non-integer! {0}'.format(support_msg_api())) if offline: self.critical() self.msg += 'offline: {0}'.format(offline_reason) else: self.msg += 'online' self.msg += ', num executors = {0}'.format(num_executors) self.check_thresholds(num_executors) self.msg += ' | num_executors={0:d}'.format(num_executors) self.msg += self.get_perf_thresholds(boundary='lower') self.msg += ' query_time={0:.4f}s'.format(query_time)
def parse(content): try: _ = json.loads(content) if log.isEnabledFor(logging.DEBUG): log.debug(jsonpp(_)) compaction_queue_size = None for bean in _['beans']: if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server': if log.isEnabledFor(logging.DEBUG): log.debug('found RegionServer section:') log.debug(jsonpp(bean)) compaction_queue_size = bean['compactionQueueLength'] if not isInt(compaction_queue_size): qquit('UNKNOWN', 'non-integer returned for compactionQueueLength! ' + support_msg_api()) return compaction_queue_size except ValueError as _: qquit('UNKNOWN', _ + ': failed to parse HBase Master jmx info. ' + support_msg_api()) qquit('UNKNOWN', 'RegionServer mbean not found, double check this is pointing to an HBase RegionServer')