def output(self, connect_time, total_time):
    self.msg = "HBase write spray to {0} column {1} x {2} region{3}".format(
        self.num_column_families,
        'families' if plural(self.num_column_families) else 'family',
        self.num_regions,
        plural(self.num_regions))
    precision = self.precision
    self.msg += " total_time={0:0.{precision}f}ms".format(total_time, precision=precision)
    self.msg += " connect_time={connect_time:0.{precision}f}ms".format(connect_time=connect_time,
                                                                       precision=precision)
    perfdata = " | total_time={total_time:0.{precision}f}ms connect_time={connect_time:0.{precision}f}ms"\
               .format(total_time=total_time, connect_time=connect_time, precision=precision)
    self.msg += ", max timings: column family "
    for cf_qf in self.timings:
        column = cf_qf.split(':', 2)[0]
        self.msg += "'{0}'".format(column)
        for action in ['write', 'read', 'delete']:
            query_time = self.timings[cf_qf][action]
            self.msg += " {0}_time={1:0.{precision}f}ms".format(action, query_time, precision=precision)
            self.check_thresholds(self.timings[cf_qf][action])
            perfdata += " '{0}_max_{1}_time'={2:0.{precision}f}ms".format(column, action, query_time,
                                                                          precision=precision)
            perfdata += self.get_perf_thresholds()
        self.msg += ', '
    self.msg = self.msg.rstrip(', ')
    self.msg += perfdata
def run(self):
    try:
        linux_only(' as it reads /proc/mounts for more reliable information than the mount command provides' +
                   ', see --help description for more details')
    except LinuxOnlyException as _:
        raise UnknownError('LinuxOnlyException: {}'.format(_))
    mount_lines = self.get_mounts()
    (num_read_only, num_checked, read_only) = self.parse_mounts(mount_lines)
    self.msg = '{} read only mount point{} out of {} mount point{} checked'\
               .format(num_read_only, plural(num_read_only), num_checked, plural(num_checked))
    if num_read_only == 0:
        self.ok()
    if num_checked == 0:
        self.warning()
        self.msg += ' (no matching mount points?)'
    if num_read_only > 0:
        self.critical()
        self.msg += '!'
        if self.verbose:
            from pprint import pprint
            pprint(read_only)
            # .items() rather than py2-only .iteritems() for Python 2 / 3 compatibility
            if self.verbose > 1:
                _ = ['{}({})'.format(mount_point, _type) for mount_point, _type in read_only.items()]
            else:
                _ = [mount_point for mount_point, _type in read_only.items()]
            self.msg += ' [{}]'.format(', '.join(_))
    self.msg += ' | read_only_mount_points={} mount_points_checked={}'.format(num_read_only, num_checked)
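# linux_only() isn't defined in this section. A minimal sketch consistent with
# how it's called above (base message plus an appended suffix) -- an assumption
# about the shared pylib helper, not its actual implementation:
import platform

class LinuxOnlyException(Exception):
    pass

def linux_only(msg=''):
    """Raise LinuxOnlyException on any platform other than Linux."""
    if platform.system() != 'Linux':
        raise LinuxOnlyException('this program is only supported on Linux' + msg)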
def parse_json(self, json_data):
    if self.list:
        print('Ambari Clusters:\n')
        for _ in json_data['items']:
            print(_['Clusters']['cluster_name'])
        sys.exit(ERRORS['UNKNOWN'])
    racks = {}
    for host in json_data['items']:
        host_name = host['Hosts']['host_name']
        rack = host['Hosts']['rack_info']
        if rack not in racks:
            racks[rack] = []
        racks[rack].append(host_name)
    num_racks = len(racks)
    self.msg = '{} rack{} configured'.format(num_racks, plural(num_racks))
    if num_racks < 2:
        self.warning()
        self.msg += ' (no rack resilience!)'
    default_rack = '/default-rack'
    num_nodes_left_in_default_rack = 0
    if default_rack in racks:
        self.warning()
        num_nodes_left_in_default_rack = len(racks[default_rack])
        msg = "{num} node{plural} left in '{default_rack}'!"\
              .format(num=num_nodes_left_in_default_rack,
                      plural=plural(num_nodes_left_in_default_rack),
                      default_rack=default_rack)
        if self.verbose:
            msg += ' [{}]'.format(', '.join(racks[default_rack]))
        self.msg = msg + ' - ' + self.msg
    self.msg += ' | hdfs_racks={};2 nodes_in_default_rack={};0'\
                .format(num_racks, num_nodes_left_in_default_rack)
def run(self):
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            user = server.get_whoami()
            log.debug('connected as user %s', jsonpp(user))
        log.debug('getting Jenkins nodes')
        nodes = server.get_nodes()
        log.debug('nodes: %s', nodes)
        node_count = len(nodes)
        log.debug('node count: %s', node_count)
        offline_nodes = 0
        for node in nodes:
            if node['offline']:
                offline_nodes += 1
        self.msg += '{0} offline node{1}'.format(offline_nodes, plural(offline_nodes))
        self.check_thresholds(offline_nodes)
        self.msg += ' out of {0} node{1}'.format(node_count, plural(node_count))
    except jenkins.JenkinsException as _:
        raise CriticalError(_)
    query_time = time.time() - start_time
    self.msg += ' | offline_nodes={0:d}'.format(offline_nodes)
    self.msg += self.get_perf_thresholds()
    self.msg += ' node_count={0:d}'.format(node_count)
    self.msg += ' query_time={0:.4f}s'.format(query_time)
def parse_json(self, json_data):
    log.info('parsing response')
    try:
        data = json_data['beans'][0]
        name_dir_statuses = data['NameDirStatuses']
        name_dir_data = json.loads(name_dir_statuses)
        active_dirs = name_dir_data['active']
        failed_dirs = name_dir_data['failed']
        num_active_dirs = len(active_dirs)
        num_failed_dirs = len(failed_dirs)
        self.msg = 'NameNode has {0} failed dir{1}'.format(num_failed_dirs, plural(num_failed_dirs))
        if num_failed_dirs > 0:
            self.warning()
            if self.verbose:
                self.msg += ' ({0})'.format(', '.join(failed_dirs))
        self.msg += ', {0} active dir{1}'.format(num_active_dirs, plural(num_active_dirs))
        if num_active_dirs < 1:
            self.critical()
        if self.verbose and num_active_dirs > 0:
            self.msg += ' ({0})'.format(', '.join(active_dirs))
        self.msg += ' | num_failed_dirs={0} num_active_dirs={1}'.format(num_failed_dirs, num_active_dirs)
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
    except ValueError as _:
        raise UnknownError("invalid json returned for NameDirStatuses by Namenode '{0}:{1}': {2}"\
                           .format(self.host, self.port, _))
def check_times(self, start_date, end_date, max_age, max_runtime):
    try:
        start_datetime = datetime.strptime(start_date, '%m/%d/%Y %H:%M:%S')
        end_datetime = datetime.strptime(end_date, '%m/%d/%Y %H:%M:%S')
    except ValueError as _:
        qquit('UNKNOWN', 'error parsing date time format: {0}'.format(_))
    runtime_delta = end_datetime - start_datetime
    self.msg += ' in {0}'.format(sec2human(runtime_delta.seconds))
    # max_runtime is in minutes, so compare against the runtime in minutes
    if max_runtime is not None and (runtime_delta.seconds / 60.0) > max_runtime:
        self.warning()
        self.msg += ' (greater than {0} min{1}!)'.format(str(max_runtime).rstrip('0').rstrip('.'),
                                                         plural(max_runtime))
    age_timedelta = datetime.now() - start_datetime
    if self.verbose:
        self.msg += ", start date = '{startdate}', end date = '{enddate}'".\
                    format(startdate=start_date, enddate=end_date)
    self.msg += ', started {0} ago'.format(sec2human(age_timedelta.seconds))
    if max_age is not None and age_timedelta.seconds > (max_age * 60.0):
        self.warning()
        self.msg += ' (last run started more than {0} min{1} ago!)'.format(str(max_age).rstrip('0').rstrip('.'),
                                                                           plural(max_age))
    self.msg += ' |'
    # perfdata values are in seconds, so thresholds given in minutes are multiplied by 60
    self.msg += ' runtime={0}s;{1}'.format(runtime_delta.seconds, max_runtime * 60 if max_runtime else '')
    self.msg += ' age={0}s;{1}'.format(age_timedelta.seconds, max_age * 60 if max_age else '')
    self.msg += ' auth_time={auth_time}s query_time={query_time}s'.format(auth_time=self.auth_time,
                                                                          query_time=self.query_time)
def check_ingestion(self, num, filter_opts=None, max_age=None, max_runtime=None):
    log.info('checking ingestion history')
    json_dict = self.get_ingestions(num, filter_opts)
    info = ''
    if self.verbose:
        for key in sorted(filter_opts):
            info += " {0}='{1}'".format(key, filter_opts[key])
    try:
        result = json_dict['result']
        if not result:
            qquit('CRITICAL', "no results found for ingestion{0}"\
                  .format('{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                          'Perhaps you specified incorrect filters? Use --list to see existing ingestions'))
        num_results = len(result)
        log.info('%s ingestion history results returned', num_results)
        self.check_statuses(result)
        if num:
            self.msg += ' out of last {0} ingest{1}'.format(num_results, plural(num_results))
        if self.history_mins:
            self.msg += ' within last {0} min{1}'.format(str(self.history_mins).rstrip('0').rstrip('.'),
                                                         plural(self.history_mins))
        longest_incomplete_timedelta = self.check_longest_incomplete_ingest(result, max_runtime)
        # newest is first
        # effectiveDate is null in testing (docs say it's a placeholder for future use)
        # using ingestionTimeFormatted instead, could also use ingestionTime which is a timestamp in millis
        ingestion_date = result[0]['ingestionTimeFormatted']
        age_timedelta = self.check_last_ingest_age(ingestion_date=ingestion_date, max_age=max_age)
        params_reference = [('inventoryId', 'id'), ('fileName', 'source'), ('destinationPath', 'dest')]
        if self.verbose and [param for (param, _) in params_reference if param in filter_opts]:
            self.msg += ' for'
            for (param, name) in params_reference:
                if param in filter_opts:
                    self.msg += " {name}='{value}'".format(name=name, value=filter_opts[param])
        self.msg += ' |'
        # max_age is in minutes and the perfdata values are seconds, hence * 60
        self.msg += ' last_ingest_age={0}s;{1}'.format(age_timedelta.seconds,
                                                       max_age * 60 if max_age else '')
        self.msg += ' longest_incomplete_ingest_age={0}s;{1}'\
                    .format(longest_incomplete_timedelta.seconds if longest_incomplete_timedelta else 0,
                            max_age * 60 if max_age else '')
        self.msg += ' auth_time={auth_time}s query_time={query_time}s'.format(auth_time=self.auth_time,
                                                                              query_time=self.query_time)
    except KeyError as _:
        qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))
def check_table_regions(self):
    log.info('checking regions for table \'%s\'', self.table)
    regions = None
    try:
        table = self.conn.table(self.table)
        regions = table.regions()
    except HBaseIOError as _:
        #if 'org.apache.hadoop.hbase.TableNotFoundException' in _.message:
        if 'TableNotFoundException' in _.message:
            qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
        else:
            qquit('CRITICAL', _)
    except (socket.error, socket.timeout, ThriftException) as _:
        qquit('CRITICAL', _)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(regions))
    if not regions:
        qquit('CRITICAL', 'failed to get regions for table \'{0}\''.format(self.table))
    if not isList(regions):
        qquit('UNKNOWN', 'region info returned is not a list! ' + support_msg_api())
    num_regions = len(regions)
    log.info('num regions: %s', num_regions)
    self.msg = 'HBase table \'{0}\' has {1} region{2}'.format(self.table, num_regions, plural(num_regions))
    self.check_thresholds(num_regions)
    num_unassigned_regions = 0
    for region in regions:
        try:
            if not region['server_name']:
                #log.debug('region \'%s\' is not assigned to any server', region['name'])
                num_unassigned_regions += 1
        except KeyError as _:
            qquit('UNKNOWN', 'failed to find server assigned to region. ' + support_msg_api())
    log.info('num unassigned regions: %s', num_unassigned_regions)
    self.msg += ', {0} unassigned region{1}'.format(num_unassigned_regions, plural(num_unassigned_regions))
    if num_unassigned_regions > 0:
        self.warning()
        self.msg += '!'
    self.msg += ' |'
    self.msg += ' num_regions={0}'.format(num_regions) + self.get_perf_thresholds(boundary='lower')
    self.msg += ' num_unassigned_regions={0};1;0'.format(num_unassigned_regions)
    log.info('finished, closing connection')
    self.conn.close()
def check_times(self, start_date, end_date):
    start_date = str(start_date).strip()
    end_date = str(end_date).strip()
    invalid_dates = ('', 'null', 'None', None)
    age_timedelta = None
    runtime_delta = None
    if start_date not in invalid_dates and \
       end_date not in invalid_dates:
        try:
            start_datetime = datetime.strptime(start_date, '%m/%d/%Y %H:%M:%S')
            end_datetime = datetime.strptime(end_date, '%m/%d/%Y %H:%M:%S')
        except ValueError as _:
            qquit('UNKNOWN', 'error parsing date time format: {0}'.format(_))
        runtime_delta = end_datetime - start_datetime
        runtime_delta_secs = self.timedelta_seconds(runtime_delta)
        self.msg += ' in {0}'.format(sec2human(runtime_delta_secs))
        if self.max_runtime is not None and (runtime_delta_secs / 60.0) > self.max_runtime:
            self.warning()
            self.msg += ' (greater than {0} min{1}!)'.format(str(self.max_runtime).rstrip('0').rstrip('.'),
                                                             plural(self.max_runtime))
        if self.min_runtime is not None and (runtime_delta_secs / 60.0) < self.min_runtime:
            self.warning()
            self.msg += ' (less than {0} min{1}!)'.format(str(self.min_runtime).rstrip('0').rstrip('.'),
                                                          plural(self.min_runtime))
        age_timedelta = datetime.now() - start_datetime
        age_timedelta_secs = self.timedelta_seconds(age_timedelta)
    if self.verbose:
        self.msg += ", start date = '{startdate}', end date = '{enddate}'".\
                    format(startdate=start_date, enddate=end_date)
    if age_timedelta is not None:
        self.msg += ', started {0} ago'.format(sec2human(age_timedelta_secs))
    if self.max_age is not None and age_timedelta is not None \
       and age_timedelta_secs > (self.max_age * 60.0):
        self.warning()
        self.msg += ' (last run started more than {0} min{1} ago!)'.format(
            str(self.max_age).rstrip('0').rstrip('.'), plural(self.max_age))
    # Do not output variable number of fields at all if agedelta is not available
    # as that breaks PNP4Nagios graphing
    if age_timedelta is not None and runtime_delta:
        self.msg += ' |'
        self.msg += ' runtime={0}s;{1}'.format(runtime_delta_secs,
                                               self.max_runtime * 60 if self.max_runtime else '')
        self.msg += ' age={0}s;{1}'.format(age_timedelta_secs,
                                           self.max_age * 60 if self.max_age else '')
        self.msg += ' auth_time={auth_time}s query_time={query_time}s'.format(auth_time=self.auth_time,
                                                                              query_time=self.query_time)
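# sec2human() is used throughout these checks but not defined in this section.
# A minimal sketch consistent with its call sites (takes a number of seconds,
# returns a human-readable duration) -- an assumption, not the actual pylib code:
def sec2human(secs):
    """e.g. 3725 -> '1 hour 2 mins 5 secs'"""
    secs = int(secs)
    parts = []
    for name, size in (('day', 86400), ('hour', 3600), ('min', 60), ('sec', 1)):
        value, secs = divmod(secs, size)
        if value:
            parts.append('{0} {1}{2}'.format(value, name, 's' if value != 1 else ''))
    return ' '.join(parts) if parts else '0 secs'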
def parse_json(self, json_data):
    log.info('parsing response')
    try:
        live_nodes = json_data['beans'][0]['LiveNodes']
        live_node_data = json.loads(live_nodes)
        num_datanodes = len(live_node_data)
        if num_datanodes < 1:
            raise UnknownError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\
                               .format(self.host, self.port))
        min_space = None
        max_space = 0
        for datanode in live_node_data:
            used_space = live_node_data[datanode]['usedSpace']
            if not isInt(used_space):
                raise UnknownError('usedSpace is not an integer! {0}'.format(support_msg_api()))
            used_space = int(used_space)
            log.info("datanode '%s' used space = %s", datanode, used_space)
            if min_space is None or used_space < min_space:
                min_space = used_space
            if used_space > max_space:
                max_space = used_space
        divisor = max_space
        if divisor < 1:
            log.info('max used space < 1, resetting divisor to 1 (% will likely be very high)')
            divisor = 1
        assert max_space >= min_space
        # assumes true division (from __future__ import division on Python 2)
        largest_imbalance_pc = float('{0:.2f}'.format(((max_space - min_space) / divisor) * 100))
        assert largest_imbalance_pc >= 0
        self.ok()
        self.msg = '{0}% HDFS imbalance on space used'.format(largest_imbalance_pc)
        self.check_thresholds(largest_imbalance_pc)
        self.msg += ' across {0:d} datanode{1}'.format(num_datanodes, plural(num_datanodes))
        if self.verbose:
            self.msg += ', min used space = {0}, max used space = {1}'.format(min_space, max_space)
        if self.verbose and (self.is_warning() or self.is_critical()):
            self.msg += ' [imbalanced nodes: '
            for datanode in live_node_data:
                used_space = int(live_node_data[datanode]['usedSpace'])
                used_pc = used_space / max_space * 100
                if used_pc > self.thresholds['warning']['upper']:
                    self.msg += '{0}({1:.2f}%),'.format(datanode, used_pc)
            self.msg = self.msg.rstrip(',') + ']'
        self.msg += " | 'HDFS imbalance on space used %'={0}".format(largest_imbalance_pc)
        self.msg += self.get_perf_thresholds()
        self.msg += " num_datanodes={0}".format(num_datanodes)
        self.msg += " min_used_space={0}".format(min_space)
        self.msg += " max_used_space={0}".format(max_space)
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
def run(self):
    start_time = time.time()
    for page in range(1, self.max_pages + 1):
        url = 'https://registry.hub.docker.com/v2/repositories/{repo}/buildhistory?page={page}'\
              .format(repo=self.repo, page=page)
        req = self.request.get(url)
        if log.isEnabledFor(logging.DEBUG):
            log.debug(jsonpp(req.content))
        json_data = json.loads(req.content)
        log.debug('%s out of %s results returned for page %s',
                  len(json_data['results']), json_data['count'], page)
        if self.process_results(json_data):
            # not quite as accurate as before as it now includes processing time but close enough
            query_time = time.time() - start_time
            if '|' not in self.msg:
                self.msg += ' |'
            self.msg += ' query_time={0:.2f}s'.format(query_time)
            return True
    extra_info = ''
    if self.verbose:
        extra_info = ' ({0} page{1} of API output)'\
                     .format(self.max_pages, plural(self.max_pages))
    raise UnknownError('no completed builds found in last {0} builds{1}'.format(self.max_pages * 10, extra_info))
def print_stats(self, host):
    stats = self.stats
    show = self.show
    tstamp = time.strftime('%F %T')
    if not stats:
        print("No table regions found for table '{}'. Did you specify the correct table name?"
              .format(self.table))
        sys.exit(1)
    if self.first_iteration:
        log.info('first iteration or recent new region, skipping iteration until we have a differential')
        print('{}\t{} rate stats will be available in next iteration in {} sec{}'\
              .format(tstamp, host, self.interval, plural(self.interval)))
        self.first_iteration = 0
        return
    for table in sorted(stats[host]):
        for region in sorted(stats[host][table]):
            table_region = region
            if len(stats) > 1:
                table_region = '{}:{}'.format(table, region)
            # maintain explicit order for humans
            # rather than iterate keys of region which will come out in the wrong order
            for metric in ('read', 'write', 'total'):
                if (not show) or metric in show:
                    print('{:20s}\t{:20s}\t{:40s}\t{:10s}\t{:8.0f}'\
                          .format(tstamp, host, table_region, metric,
                                  stats[host][table][region][metric]))
    print()
def process_bean(self, host, bean, uptime):
    region_regex = re.compile('^Namespace_{namespace}_table_{table}_region_(.+)_metric_(.+)RequestCount'\
                              .format(namespace=self.namespace, table=self.table))
    first_iteration = 1
    for key in sorted(bean):
        match = region_regex.match(key)
        if match:
            region = match.group(1)
            metric = match.group(2)
            #log.debug('match region %s %s request count', region, metric)
            if self.since_uptime:
                print('{:20s}\t{:20s}\t\t{:10s}\t{:8.0f}'.format(host, region, metric, bean[key] / uptime))
            else:
                tstamp = time.strftime('%F %T')
                if region not in self.stats:
                    self.stats[region] = {}
                if metric in self.stats[region]:
                    print('{}\t{:20s}\t{:20s}\t\t{:10s}\t{:8.0f}'\
                          .format(tstamp, host, region, metric, bean[key] - self.stats[region][metric]))
                else:
                    if first_iteration:
                        print('{}\trate stats will be available in next iteration in {} sec{}'\
                              .format(tstamp, self.interval, plural(self.interval)))
                        first_iteration = 0
                self.stats[region][metric] = bean[key]
    print()
def msg_queue_stats(self, queue_stats):
    matching_queues = len(queue_stats['allowed']) + \
                      len(queue_stats['non-allowed']) + \
                      len(queue_stats['disallowed']) - 3  # account for 'total' in each dict
    self.msg += "{0} matching queue{1}".format(matching_queues, plural(matching_queues))
    for _type in ('disallowed', 'non-allowed', 'allowed'):
        self.msg += ', {0} = {1}'.format(_type, queue_stats[_type]['total'])
    if self.verbose and matching_queues > 1:
        for queue in sorted(list(set(queue_stats['disallowed'].keys() +
                                     queue_stats['non-allowed'].keys() +
                                     queue_stats['allowed'].keys()))):
            if queue == 'total':
                continue
            for _type in ('disallowed', 'non-allowed', 'allowed'):
                self.msg += ', {0} {1} = {2}'.format(queue, _type, queue_stats[_type].get(queue, 0))
    self.msg += ' |'
    for _type in ('disallowed', 'non-allowed', 'allowed'):
        self.msg += " '{0}'={1}".format(_type, queue_stats[_type]['total'])
    if self.verbose and matching_queues > 1:
        for queue in sorted(list(set(queue_stats['disallowed'].keys() +
                                     queue_stats['non-allowed'].keys() +
                                     queue_stats['allowed'].keys()))):
            if queue == 'total':
                continue
            for _type in ('disallowed', 'non-allowed', 'allowed'):
                self.msg += " '{0} {1}'={2}".format(queue, _type, queue_stats[_type].get(queue, 0))
    return queue_stats
def timeout_handler(self, signum, frame):  # pylint: disable=unused-argument
    for child in psutil.Process().children():
        child.kill()
    time.sleep(1)
    qquit('UNKNOWN', 'self timed out after %d second%s' % (self.timeout, plural(self.timeout)))
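# plural() appears in every check above but isn't defined in this section.
# A minimal sketch matching how it's called (ints, floats and lists all work)
# -- an assumption about the shared pylib helper, not its actual code:
def plural(arg):
    """Return 's' unless arg is exactly 1 (or has length 1), so callers can
    write '{0} node{1}'.format(n, plural(n))."""
    try:
        num = len(arg)   # collections pluralise by their length
    except TypeError:
        num = arg        # plain numbers are used as-is
    return '' if num == 1 else 's'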
def process_stats(self, stats):
    lowest_requests = None
    highest_requests = None
    lowest_regionserver = None
    highest_regionserver = None
    for regionserver in stats:
        if lowest_requests is None:
            lowest_requests = stats[regionserver]
            lowest_regionserver = regionserver
        if highest_requests is None:
            highest_requests = stats[regionserver]
            highest_regionserver = regionserver
        if stats[regionserver] > highest_requests:
            highest_requests = stats[regionserver]
            highest_regionserver = regionserver
        if stats[regionserver] < lowest_requests:
            lowest_requests = stats[regionserver]
            lowest_regionserver = regionserver
    # simple algo - let me know if you think there can be a better calculation
    imbalance = (highest_requests - lowest_requests) / max(highest_requests, 1) * 100
    num_regionservers = len(stats)
    self.msg = 'HBase RegionServers reqs/sec imbalance = {:.0f}% across {} RegionServer{}'\
               .format(imbalance, num_regionservers, plural(num_regionservers))
    self.check_thresholds(imbalance)
    if self.verbose or not self.is_ok():
        self.msg += ' [min reqs/sec={} on {} / max reqs/sec={} on {}]'\
                    .format(lowest_requests, lowest_regionserver, highest_requests, highest_regionserver)
    self.msg += ' | reqs_per_sec_balance={:.2f}%{} lowest_requests_per_sec={} highest_requests_per_sec={}'\
                .format(imbalance, self.get_perf_thresholds(), lowest_requests, highest_requests)
def print_stats(self, host):
    stats = self.stats
    tstamp = time.strftime('%F %T')
    if not stats:
        print("No regionserver stats found. Did you specify correct regionserver addresses and --port?")
        sys.exit(1)
    if self.first_iteration:
        log.info('first iteration, skipping iteration until we have a differential')
        print('{}\t{} rate stats will be available in next iteration in {} sec{}'\
              .format(tstamp, host, self.interval, plural(self.interval)))
        self.first_iteration = 0
        return
    for metric in self.request_types:
        if self.request_type and metric not in self.request_type:
            continue
        try:
            val = '{:8.0f}'.format(stats[host][metric])
        # might happen if server is down for maintenance - in which case N/A and retry later rather than crash
        except KeyError:
            val = 'N/A'
        print('{:20s}\t{:20s}\t{:10s}\t{}'\
              .format(tstamp, host, metric, val))
    print()
def run(self):
    self.no_args()
    directory = self.get_opt('directory')
    validate_directory(directory)
    directory = os.path.abspath(directory)
    try:
        repo = git.Repo(directory)
    except InvalidGitRepositoryError as _:
        raise CriticalError("directory '{}' does not contain a valid Git repository!".format(directory))
    try:
        untracked_files = repo.untracked_files
        num_untracked_files = len(untracked_files)
        changed_files = [item.a_path for item in repo.index.diff(None)]
        changed_files = [filename for filename in changed_files if filename not in untracked_files]
        num_changed_files = len(changed_files)
    except InvalidGitRepositoryError as _:
        raise CriticalError(_)
    except TypeError as _:
        raise CriticalError(_)
    self.msg = '{} changed file{}'.format(num_changed_files, plural(num_changed_files))
    self.msg += ', {} untracked file{}'.format(num_untracked_files, plural(num_untracked_files))
    self.msg += " in Git checkout at directory '{}'".format(directory)
    uncommitted_staged_changes = 0
    if changed_files or untracked_files:
        self.critical()
        if self.verbose:
            if changed_files:
                self.msg += ' (changed files: {})'.format(', '.join(changed_files))
            if untracked_files:
                self.msg += ' (untracked files: {})'.format(', '.join(untracked_files))
    elif repo.is_dirty():
        self.msg += ', uncommitted staged changes detected!'
        self.critical()
        uncommitted_staged_changes = 1
    self.msg += ' | changed_files={};0;0 untracked_files={};0;0'.format(num_changed_files, num_untracked_files)
    self.msg += ' uncommitted_staged_changes={};0;0'.format(uncommitted_staged_changes)
def check_ingestion(self, num, filter_opts=None, max_age=None, max_runtime=None):
    log.info('checking ingestion history')
    json_dict = self.get_ingestions(num, filter_opts)
    info = ''
    if self.verbose:
        for key in sorted(filter_opts):
            info += " {0}='{1}'".format(key, filter_opts[key])
    try:
        results = json_dict['result']
        if not results:
            qquit('CRITICAL', "no results found for ingestion{0}"\
                  .format('{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                          'Perhaps you specified incorrect filters? Use --list to see existing ingestions'))
        num_results = len(results)
        log.info('%s ingestion history results returned', num_results)
        self.check_statuses(results)
        if num:
            self.msg += ' out of last {0} ingest{1}'.format(num_results, plural(num_results))
        if self.history_mins:
            self.msg += ' within last {0} ({1} min{2})'.format(sec2human(self.history_mins * 60),
                                                               str(self.history_mins).rstrip('0').rstrip('.'),
                                                               plural(self.history_mins))
        longest_incomplete_timedelta = self.check_longest_incomplete_ingest(results, max_runtime)
        age_timedelta_secs = self.check_last_ingest_age(results, max_age=max_age)
        self.msg_filter_details(filter_opts=filter_opts)
        self.msg += ' |'
        # max_age is in minutes and the perfdata values are seconds, hence * 60
        self.msg += ' last_ingest_age={0}s;{1}'.format(age_timedelta_secs,
                                                       max_age * 60 if max_age else '')
        self.msg += ' longest_incomplete_ingest_age={0}s;{1}'\
                    .format(self.timedelta_seconds(longest_incomplete_timedelta)
                            if longest_incomplete_timedelta else 0,
                            max_age * 60 if max_age else '')
        self.msg += ' auth_time={auth_time}s query_time={query_time}s'.format(auth_time=self.auth_time,
                                                                              query_time=self.query_time)
    except KeyError as _:
        qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))
def timeout_handler(self, signum, frame):  # pylint: disable=unused-argument
    # problem with this is that it'll print and then the exit exception will be caught and quit() printed again
    # raising a custom TimeoutException would need to be handled in main, but that would also likely print and be
    # re-caught and re-printed by NagiosPlugin
    #print('self timed out after %d second%s' % (self.timeout, plural(self.timeout)))
    #sys.exit(ERRORS['UNKNOWN'])
    # if using die the same thing will happen since die is a custom func which prints and then calls exit,
    # only the exit would be caught
    qquit('UNKNOWN', 'self timed out after %d second%s' % (self.timeout, plural(self.timeout)))
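# qquit() is the quit-with-Nagios-status helper the comments above reason about.
# A minimal stand-in consistent with its usage here -- an assumption; the real
# pylib version does more (logging, traceback handling):
import sys

ERRORS = {'OK': 0, 'WARNING': 1, 'CRITICAL': 2, 'UNKNOWN': 3, 'DEPENDENT': 4}

def qquit(status, msg=''):
    """Print 'STATUS: msg' and exit immediately with the Nagios exit code for that status."""
    if msg:
        print('{0}: {1}'.format(status, msg))
    sys.exit(ERRORS.get(status, ERRORS['UNKNOWN']))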
def parse_json(self, json_data):
    if not isList(json_data):
        raise UnknownError('non-list returned by Presto for failed nodes. {0}'.format(support_msg_api()))
    num_failed_nodes = len(json_data)
    self.msg = 'Presto SQL - {0} worker node{1} failed'.format(num_failed_nodes, plural(num_failed_nodes))
    self.check_thresholds(num_failed_nodes)
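# isList() and isInt() are pylib validation helpers used throughout this
# section; rough sketches of their apparent contracts (assumptions, not the
# real helpers, which likely do stricter regex-based validation):
def isList(arg):
    """True if arg is a plain list."""
    return isinstance(arg, list)

def isInt(arg):
    """True if arg looks like a whole number (e.g. 7 or '7', but not '7.5')."""
    try:
        int(str(arg))
        return True
    except ValueError:
        return False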
def parse_json(self, json_data):
    app_list = self.get_app_list(json_data)
    (num_shells_breaching_sla, num_matching_apps, max_elapsed, max_threshold_msg) = \
        self.check_app_elapsed_times(app_list)
    self.msg += '{0}, checked {1} Spark Shell{2} out of {3} running apps'\
                .format(num_shells_breaching_sla, num_matching_apps,
                        plural(num_matching_apps), len(app_list)) + \
                ', longest running Spark Shell = {0} secs{1}'\
                .format(max_elapsed, max_threshold_msg)
    self.msg += ' | num_spark_shells_breaching_SLA={0} max_elapsed_spark_shell_time={1}{2}'\
                .format(num_shells_breaching_sla, max_elapsed, self.get_perf_thresholds())
def parse_json(self, json_data):
    log.info('parsing response')
    try:
        live_nodes = json_data['beans'][0]['LiveNodes']
        live_node_data = json.loads(live_nodes)
        num_datanodes = len(live_node_data)
        if num_datanodes < 1:
            raise CriticalError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\
                                .format(self.host, self.port))
        max_blocks = 0
        min_blocks = None
        for datanode in live_node_data:
            blocks = live_node_data[datanode]['numBlocks']
            if not isInt(blocks):
                raise UnknownError('numBlocks {} is not an integer! {}'.format(blocks, support_msg_api()))
            blocks = int(blocks)
            log.info("datanode '%s' has %s blocks", datanode, blocks)
            if blocks > max_blocks:
                max_blocks = blocks
            if min_blocks is None or blocks < min_blocks:
                min_blocks = blocks
        log.info("max blocks on a single datanode = %s", max_blocks)
        log.info("min blocks on a single datanode = %s", min_blocks)
        if min_blocks is None:
            raise UnknownError('min_blocks is None')
        divisor = min_blocks
        if min_blocks < 1:
            log.info("min blocks < 1, resetting divisor to 1 (% will be very high)")
            divisor = 1
        block_imbalance = float("{0:.2f}".format((max_blocks - min_blocks) / divisor * 100))
        self.msg = '{0}% block imbalance across {1} datanode{2}'\
                   .format(block_imbalance, num_datanodes, plural(num_datanodes))
        self.ok()
        self.check_thresholds(block_imbalance)
        if self.verbose:
            self.msg += ' (min blocks = {0}, max blocks = {1})'.format(min_blocks, max_blocks)
        self.msg += " | block_imbalance={0}%".format(block_imbalance)
        self.msg += self.get_perf_thresholds()
        self.msg += " num_datanodes={0}".format(num_datanodes)
        self.msg += " min_blocks={0}".format(min_blocks)
        self.msg += " max_blocks={0}".format(max_blocks)
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
    except ValueError as _:
        raise UnknownError("invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"\
                           .format(self.host, self.port, _))
def run(self):
    racks = self.get_rack_info()
    num_racks = len(racks)
    self.msg = '{} rack{} configured'.format(num_racks, plural(num_racks))
    if num_racks < 2:
        self.warning()
        self.msg += ' (no rack resilience!)'
    default_rack = '/default-rack'
    num_nodes_left_in_default_rack = 0
    if default_rack in racks:
        self.warning()
        num_nodes_left_in_default_rack = len(racks[default_rack])
        msg = "{num} node{plural} left in '{default_rack}'!"\
              .format(num=num_nodes_left_in_default_rack,
                      plural=plural(num_nodes_left_in_default_rack),
                      default_rack=default_rack)
        if self.verbose:
            msg += ' [{}]'.format(', '.join(racks[default_rack]))
        self.msg = msg + ' - ' + self.msg
    self.msg += ' | hdfs_racks={};2 nodes_in_default_rack={};0 query_time={:.2f}s'\
                .format(num_racks, num_nodes_left_in_default_rack, self.query_time)
def end(self):
    if self.node_count is None:
        raise UnknownError('node count is not set!')
    self.msg = '{0} {1}{2} {3}'.format(self.node_count, self.agent_name,
                                       plural(self.node_count), self.state)
    self.check_thresholds(name=self.agent_name, result=self.node_count)
    if self.additional_info:
        self.msg += ', {0}'.format(self.additional_info)
    self.msg += ' | {0}s_{1}={2:d}{3}'.format(self.agent_name, self.state,
                                              self.node_count, self.get_perf_thresholds())
    if self.additional_perfdata:
        self.msg += ' {0}'.format(self.additional_perfdata)
    qquit(self.status, self.msg)
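# For reference, the output end() builds follows the standard Nagios plugin
# format: human-readable text, then ' | ', then 'label=value[;warn[;crit]]'
# perfdata tokens. With hypothetical values agent_name='node', state='running',
# node_count=3 and warning/critical thresholds of 5/10, the line would be:
#
#   3 nodes running | nodes_running=3;5;10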
def end(self):
    if self.node_count is None:
        raise UnknownError('node count is not set!')
    self.msg = '{0} {1}{2} {3}'.format(self.node_count, self.agent_name,
                                       plural(self.node_count), self.state)
    self.check_thresholds(self.node_count)
    if self.additional_info:
        self.msg += ', {0}'.format(self.additional_info)
    # node_count is a count, not seconds, so no unit suffix on the perfdata value
    self.msg += ' | {0}s_{1}={2:d}{3}'.format(self.agent_name, self.state,
                                              self.node_count, self.get_perf_thresholds())
    if self.additional_perfdata:
        self.msg += ' {0}'.format(self.additional_perfdata)
    qquit(self.status, self.msg)
def check_last_ingest_age(self, ingestion_date, max_age):
    log.info('checking last ingest age')
    age_timedelta = self.get_timedelta(ingestion_date=ingestion_date)
    if self.verbose:
        self.msg += ", last ingest start date = '{ingestion_date}'".format(ingestion_date=ingestion_date)
    # beware: timedelta.seconds discards whole days - the later variant of this
    # method uses timedelta_seconds() to count the full span instead
    self.msg += ', started {0} ago'.format(sec2human(age_timedelta.seconds))
    if max_age is not None and age_timedelta.seconds > (max_age * 60.0):
        self.warning()
        self.msg += ' (last run started more than {0} min{1} ago!)'.format(
            str(max_age).rstrip('0').rstrip('.'), plural(max_age))
    return age_timedelta
def check_statuses(self, results):
    # known statuses from doc: SUCCESS / INGESTION FAILED / WORKFLOW FAILED / INCOMPLETE
    log.info('checking statuses')
    result_statuses = {}
    num_results = len(results)
    for item in results:
        status = item['status']
        result_statuses[status] = result_statuses.get(status, 0)
        result_statuses[status] += 1
    if not result_statuses:
        code_error('no ingestion status results parsed')
    if 'SUCCESS' not in result_statuses:
        self.msg += 'NO SUCCESSFUL INGESTS in history of last {0} ingest runs! '.format(num_results)
        self.warning()
    self.msg += 'ingestion{0} status: '.format(plural(num_results))
    for status in result_statuses:
        if status not in ('SUCCESS', 'INCOMPLETE'):
            self.critical()
        self.msg += '{0} = {1} time{2}, '.format(status, result_statuses[status],
                                                 plural(result_statuses[status]))
    self.msg = self.msg.rstrip(', ')
    return result_statuses
def parse_json(self, json_data):
    log.info('parsing response')
    try:
        live_nodes_str = json_data['beans'][0]['LiveNodes']
        dead_nodes_str = json_data['beans'][0]['DeadNodes']
        decom_nodes_str = json_data['beans'][0]['DecomNodes']
        live_nodes = json.loads(live_nodes_str)
        dead_nodes = json.loads(dead_nodes_str)
        decom_nodes = json.loads(decom_nodes_str)
        self.print_nodes(live_nodes=live_nodes, dead_nodes=dead_nodes, decom_nodes=decom_nodes)
        last_contact_secs = None
        for item in live_nodes:
            if self.match_datanode(self.datanode, item):
                last_contact_secs = live_nodes[item]['lastContact']
        # always check decom and dead nodes regardless if last_contact_secs was found in live nodes
        # gives an additional safety check to escalate to warning / critical
        self.msg = ''
        for item in decom_nodes:
            if self.match_datanode(self.datanode, item):
                last_contact_secs = decom_nodes[item]['lastContact']
                self.warning()
                self.msg = 'Decommissioning '
        for item in dead_nodes:
            if self.match_datanode(self.datanode, item):
                last_contact_secs = dead_nodes[item]['lastContact']
                self.critical()
                self.msg = 'Dead '
        if last_contact_secs is None:
            raise UnknownError("datanode '{0}' is not present in any of the live, ".format(self.datanode) + \
                               "decommissioning or dead node lists!")
        if not isInt(last_contact_secs):
            raise UnknownError("non-integer '{0}' returned for last contact seconds by namenode '{1}:{2}'"\
                               .format(last_contact_secs, self.host, self.port))
        last_contact_secs = int(last_contact_secs)
        if last_contact_secs < 0:
            raise UnknownError('last_contact_secs {} < 0!'.format(last_contact_secs))
        self.msg += "HDFS datanode '{0}' last contact with namenode was {1} sec{2} ago"\
                    .format(self.datanode, last_contact_secs, plural(last_contact_secs))
        self.check_thresholds(last_contact_secs)
        self.msg += ' | datanode_last_contact_secs={0}'.format(last_contact_secs)
        self.msg += self.get_perf_thresholds()
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
    except ValueError as _:
        raise UnknownError("invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"\
                           .format(self.host, self.port, _))
def parse_json(self, json_data):
    log.info('parsing response')
    try:
        live_nodes = json_data['beans'][0]['LiveNodes']
        live_node_data = json.loads(live_nodes)
        num_datanodes = len(live_node_data)
        if num_datanodes < 1:
            raise CriticalError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\
                                .format(self.host, self.port))
        min_space = None
        max_space = 0
        for datanode in live_node_data:
            used_space = live_node_data[datanode]['usedSpace']
            if not isInt(used_space):
                raise UnknownError('usedSpace {} is not an integer! {}'.format(used_space, support_msg_api()))
            used_space = int(used_space)
            log.info("datanode '%s' used space = %s", datanode, used_space)
            if min_space is None or used_space < min_space:
                min_space = used_space
            if used_space > max_space:
                max_space = used_space
        divisor = max_space
        if divisor < 1:
            log.info('max used space < 1, resetting divisor to 1 (% will likely be very high)')
            divisor = 1
        if max_space < min_space:
            raise UnknownError('max_space < min_space')
        # assumes true division (from __future__ import division on Python 2)
        largest_imbalance_pc = float('{0:.2f}'.format(((max_space - min_space) / divisor) * 100))
        if largest_imbalance_pc < 0:
            raise UnknownError('largest_imbalance_pc < 0')
        self.ok()
        self.msg = '{0}% HDFS imbalance on space used'.format(largest_imbalance_pc)
        self.check_thresholds(largest_imbalance_pc)
        self.msg += ' across {0:d} datanode{1}'.format(num_datanodes, plural(num_datanodes))
        if self.verbose:
            self.msg += ', min used space = {0}, max used space = {1}'.format(min_space, max_space)
        if self.verbose and (self.is_warning() or self.is_critical()):
            self.msg += ' [imbalanced nodes: '
            for datanode in live_node_data:
                used_space = int(live_node_data[datanode]['usedSpace'])
                used_pc = used_space / max_space * 100
                if used_pc > self.thresholds['warning']['upper']:
                    self.msg += '{0}({1:.2f}%),'.format(datanode, used_pc)
            self.msg = self.msg.rstrip(',') + ']'
        self.msg += " | 'HDFS imbalance on space used %'={0}".format(largest_imbalance_pc)
        self.msg += self.get_perf_thresholds()
        self.msg += " num_datanodes={0}".format(num_datanodes)
        self.msg += " min_used_space={0}".format(min_space)
        self.msg += " max_used_space={0}".format(max_space)
    except KeyError as _:
        raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                           .format(self.host, self.port, _, support_msg_api()))
def run(self):
    server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
    try:
        log.debug('setting up Jenkins connection to %s', server_url)
        start_time = time.time()
        server = jenkins.Jenkins(server_url, username=self.user, password=self.password,
                                 timeout=self.timeout / 3)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('getting user')
            user = server.get_whoami()
            log.debug('connected as user %s', jsonpp(user))
        log.debug('getting plugin info')
        #plugins = server.get_plugins()
        # deprecated but .get_plugins() output is not JSON serializable
        # so must use old deprecated method get_plugins_info() :-/
        plugins = server.get_plugins_info()
        query_time = time.time() - start_time
    except jenkins.JenkinsException as _:
        raise CriticalError(_)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(plugins))
    plugin_count = len(plugins)
    update_count = 0
    for plugin in plugins:
        if plugin['hasUpdate']:
            update_count += 1
    self.msg += " {0} plugin update{1} available out of {2} installed plugin{3}".format(
        update_count, plural(update_count), plugin_count, plural(plugin_count))
    if update_count:
        self.warning()
    self.msg += ' | updates_available={0};1 plugins_installed={1} query_time={2:.4f}s'.format(
        update_count, plugin_count, query_time)
def run(self):
    iam = boto3.client('iam')
    log.info('getting account summary')
    _ = iam.get_account_summary()
    log.debug('%s', jsonpp(_))
    account_summary = _['SummaryMap']
    mfa_enabled = account_summary['AccountMFAEnabled']
    access_keys = account_summary['AccountAccessKeysPresent']
    if access_keys or not mfa_enabled:
        self.warning()
    self.msg = 'AWS root account MFA enabled = {}{}'.format(bool(mfa_enabled),
                                                            ' (!)' if not mfa_enabled else "")
    self.msg += ', {} access key{} found{}'.format(access_keys, plural(access_keys),
                                                   ' (!)' if access_keys else "")
def parse_json(self, json_data):
    dynamic = self.get_key(json_data, 'dynamic')
    peers = self.get_key(json_data, 'peers')
    if not isList(peers):
        raise UnknownError("'peers' field is not a list as expected! {0}".format(support_msg_api()))
    peer_count = len(peers)
    if self.regex:
        regex = re.compile(self.regex, re.I)
        if not self.find_peer(regex, peers):
            self.msg += "no peer found matching '{0}', ".format(self.regex)
            self.critical()
    self.msg += '{0} peer{1} found'.format(peer_count, plural(peer_count))
    self.check_thresholds(peer_count)
    self.msg += ', dynamic = {0}'.format(dynamic)
    self.msg += ' | hiveserver2_llap_peers={0}{1}'.format(peer_count,
                                                          self.get_perf_thresholds(boundary='lower'))
def check_missing_traits(self, traits):
    if not isList(traits):
        raise UnknownError('traits non-list returned. {0}'.format(support_msg_api()))
    if self.traits:
        missing_traits = []
        #traits = [t.lower() for t in traits]
        for trait in self.traits:
            #if trait.lower() not in traits:
            if trait not in traits:
                missing_traits.append(trait)
        if missing_traits:
            self.critical()
            # pluralize on the traits actually missing, not all expected traits
            self.msg += " (expected trait{plural} '{missing_traits}' not found in entity)".format(
                missing_traits=','.join(missing_traits),
                plural=plural(missing_traits))
        return missing_traits
    return []
def check_missing_tags(self, tags):
    if not isList(tags):
        raise UnknownError('tags non-list returned. {0}'.format(support_msg_api()))
    if self.tags:
        missing_tags = []
        #tags = [t.lower() for t in tags]
        for tag in self.tags:
            #if tag.lower() not in tags:
            if tag not in tags:
                missing_tags.append(tag)
        if missing_tags:
            self.critical()
            # pluralize on the tags actually missing, not all expected tags
            self.msg += " (expected tag{plural} '{missing_tags}' not found in entity)".format(
                missing_tags=','.join(missing_tags),
                plural=plural(missing_tags))
        return missing_tags
    return []
def run(self):
    log.info('querying %s', self.software)
    url = '{protocol}://{host}:{port}/admin/info'\
          .format(host=self.host, port=self.port, protocol=self.protocol)
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
        #req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
    except requests.exceptions.RequestException as _:
        errhint = ''
        if 'BadStatusLine' in str(_.message):
            errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
        elif self.protocol == 'https' and 'unknown protocol' in str(_.message):
            errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
        qquit('CRITICAL', str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
    soup = BeautifulSoup(req.content, 'html.parser')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '='*80))
    try:
        license_tag = soup.find('td', {'class': 'lic-value'})
        if not license_tag:
            qquit('UNKNOWN', 'failed to find license tag while parsing')
        expiry = license_tag.text.strip()
        license_datetime = datetime.strptime(expiry, '%Y-%m-%d %H:%M:%S')
        delta = license_datetime - datetime.now()
        days = delta.days
        if days < 0:
            qquit('CRITICAL', "license has already expired on '{0}'".format(expiry))
        self.msg = "{software} license expires in {days} day{plural}"\
                   .format(software=self.software, days=days, plural=plural(days))
        self.check_thresholds(days)
        self.msg += ", expiry date = '{expiry}' | days_until_expiry={days}{thresholds}"\
                    .format(expiry=expiry, days=days,
                            thresholds=self.get_perf_thresholds(boundary='lower'))
    except (AttributeError, TypeError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_,
                      support_msg=support_msg_api()))
def check_longest_incomplete_ingest(self, result, max_runtime=None):
    log.info('checking longest running incomplete ingest')
    longest_incomplete_timedelta = None
    for item in result:
        status = item['status']
        if status == 'INCOMPLETE' and max_runtime is not None:
            runtime_delta = self.get_timedelta(item['ingestionTimeFormatted'])
            if longest_incomplete_timedelta is None or \
               self.timedelta_seconds(runtime_delta) > self.timedelta_seconds(longest_incomplete_timedelta):
                longest_incomplete_timedelta = runtime_delta
    if max_runtime is not None and \
       longest_incomplete_timedelta is not None and \
       self.timedelta_seconds(longest_incomplete_timedelta) > max_runtime * 60.0:
        self.warning()
        self.msg += ', longest incomplete ingest runtime = {0} ago! '\
                    .format(sec2human(self.timedelta_seconds(longest_incomplete_timedelta))) + \
                    '(greater than expected {0} min{1})'\
                    .format(str(max_runtime).rstrip('0').rstrip('.'), plural(max_runtime))
    return longest_incomplete_timedelta
def check_last_ingest_age(self, results, max_age):
    log.info('checking last ingest age')
    if not isList(results):
        code_error('passed non-list to check_last_ingest_age()')
    # newest is first
    # effectiveDate is null in testing (docs say it's a placeholder for future use)
    # using ingestionTimeFormatted instead, could also use ingestionTime which is a timestamp in millis
    ingestion_date = results[0]['ingestionTimeFormatted']
    age_timedelta = self.get_timedelta(ingestion_date=ingestion_date)
    age_timedelta_secs = self.timedelta_seconds(age_timedelta)
    if self.verbose:
        self.msg += ", last ingest start date = '{ingestion_date}'".format(ingestion_date=ingestion_date)
    self.msg += ', started {0} ago'.format(sec2human(age_timedelta_secs))
    if max_age is not None and age_timedelta_secs > (max_age * 60.0):
        self.warning()
        self.msg += ' (last run started more than {0} min{1} ago!)'.format(
            str(max_age).rstrip('0').rstrip('.'), plural(max_age))
    return age_timedelta_secs
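# timedelta_seconds() is called above but not defined in this section. Judging
# by its call sites it must return the whole duration in seconds; a minimal
# sketch under that assumption (not the actual pylib helper):
def timedelta_seconds(td):
    """Whole-duration seconds for a datetime.timedelta. Unlike td.seconds,
    which silently discards the days component, this counts the full span."""
    return td.days * 86400 + td.seconds  # int(td.total_seconds()) minus microseconds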
def parse_results(self, content):
    build = self.get_latest_build(content)
    number = build['number']
    log.info('build number = %s', number)
    if not isInt(number):
        raise UnknownError('build number returned is not an integer!')
    message = build['message']
    log.info('message = %s', message)
    branch = build['branch']
    log.info('branch = %s', branch)
    commit = build['commit']
    log.info('commit = %s', commit)
    started_at = build['started_at']
    log.info('started_at = %s', started_at)
    finished_at = build['finished_at']
    log.info('finished_at = %s', finished_at)
    duration = build['duration']
    log.info('duration = %s', duration)
    if not isInt(duration):
        raise UnknownError('duration returned is not an integer!')
    repository_id = build['repository_id']
    log.info('repository_id = %s', repository_id)
    if not isInt(repository_id):
        raise UnknownError('repository_id returned is not an integer!')
    result = build['result']
    log.info('result = %s', result)
    state = build['state']
    log.info('state = %s', state)
    if result == 0:
        self.ok()
        status = "PASSED"
    else:
        self.critical()
        status = "FAILED"
    self.msg = "Travis CI build #{number} {status} for repo '{repo}' in {duration} secs".format(
        number=number, status=status, repo=self.repo, duration=duration)
    self.check_thresholds(duration)
    self.msg += ", started_at='{0}'".format(started_at)
    self.msg += ", finished_at='{0}'".format(finished_at)
    if self.verbose:
        self.msg += ", message='{0}'".format(message)
        self.msg += ", branch='{0}'".format(branch)
        self.msg += ", commit='{0}'".format(commit)
        self.msg += ", repository_id='{0}'".format(repository_id)
    if self.verbose or self.builds_in_progress > 0:
        self.msg += ", {0} build{1} in progress".format(self.builds_in_progress,
                                                        plural(self.builds_in_progress))
    self.msg += " | last_build_duration={duration}s{perf_thresholds} num_builds_in_progress={builds_in_progress}"\
                .format(duration=duration, perf_thresholds=self.get_perf_thresholds(),
                        builds_in_progress=self.builds_in_progress)