def req(self, url, method='post', body=None):
     """Issue an HTTP request to the given URL and return (response, secs).

     Sends JSON Content-Type/Accept headers plus the cached JSESSIONID,
     dispatches to the requests function matching ``method`` (default POST)
     and refreshes ``self.jsessionid`` from any JSESSIONID response cookie.
     Exits via qquit('CRITICAL', ...) on a requests-level failure or a
     non-200 response; otherwise returns a tuple of the requests response
     object and the elapsed request time in seconds.
     """
     assert isStr(method)
     log.debug('%s %s', method.upper(), url)
     headers = {"Content-Type": "application/json",
                "Accept": "application/json",
                "JSESSIONID": self.jsessionid}
     log.debug('headers: %s', headers)
     start_time = time.time()
     try:
         # resolve requests.get / requests.post / ... dynamically from method
         req = getattr(requests, method.lower())(url,
                                                 #cookies=self.jar,
                                                 data=body,
                                                 headers=headers)
         # refresh the cached session id from the response cookies, if present
         for cookie_tuple in req.cookies.items():
             if cookie_tuple[0] == 'JSESSIONID':
                 self.jsessionid = cookie_tuple[1].rstrip('/')
         timing = time.time() - start_time
     except requests.exceptions.RequestException as _:
         qquit('CRITICAL', _)
     if log.isEnabledFor(logging.DEBUG):
         log.debug("response: %s %s", req.status_code, req.reason)
         content = req.content
         try:
             # pretty-print JSON bodies for readability, fall back to raw content
             content = jsonpp(req.content).strip()
         except ValueError:
             pass
         log.debug("content:\n%s\n%s\n%s", '='*80, content, '='*80)
     if req.status_code != 200:
         info = ''
         try:
             # surface the API's 'result' field in the error message when available
             info = ': {0}'.format(json.loads(req.content)['result'])
         except (KeyError, ValueError):
             pass
         qquit('CRITICAL', "%s %s%s" % (req.status_code, req.reason, info))
     return (req, timing)
Exemplo n.º 2
0
 def parse_host_name(self, item): # pylint: disable=no-self-use
     """Extract the host name from an Ambari host item (dict or JSON string)."""
     # accept either a raw JSON string or an already-decoded structure
     parsed = json.loads(item) if isStr(item) else item
     try:
         return parsed['Hosts']['host_name']
     except KeyError as _:
         qquit('CRITICAL', 'failed to parse Ambari host name: %s' % _)
Exemplo n.º 3
0
 def usage(self, msg='', status='UNKNOWN'):
     """Print the given message (or the standard usage text), show option help and exit."""
     # a caller-supplied message takes precedence over the default usage banner
     if not msg:
         print(self.usagemsg)
     else:
         print('%s\n' % msg)
     self.__parser.print_help()
     qquit(status)
Exemplo n.º 4
0
 def parse(self, content):
     """Parse the HMaster UI status page for the regions-stuck-in-transition count.

     Scans the h2 headings for the 'Regions in Transition' section and
     delegates the following table to parse_table(). Returns 0 when the
     section is absent (HMaster omits it when nothing is in transition).
     Exits via qquit('UNKNOWN', ...) on parse errors.
     """
     # could also collect lines after 'Regions-in-transition' if parsing /dump
     # sample:
     # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
     # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
     soup = BeautifulSoup(content, 'html.parser')
     #if log.isEnabledFor(logging.DEBUG):
     #    log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
     # looks like HMaster UI doesn't print this section if there are no regions in transition, must assume zero
     regions_stuck_in_transition = 0
     try:
         headings = soup.findAll('h2')
         for heading in headings:
             log.debug("checking heading '%s'", heading)
             if heading.get_text() == "Regions in Transition":
                 log.debug('found Regions in Transition section header')
                 table = heading.find_next('table')
                 log.debug('checking first following table')
                 regions_stuck_in_transition = self.parse_table(table)
                 if not isInt(regions_stuck_in_transition):
                     qquit('UNKNOWN', 'parse error - ' +
                           'got non-integer \'{0}\' for regions stuck in transition when parsing HMaster UI'\
                           .format(regions_stuck_in_transition))
         return regions_stuck_in_transition
         #qquit('UNKNOWN', 'parse error - failed to find table data for regions stuck in transition')
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse HBase Master UI status page. ' + support_msg())
Exemplo n.º 5
0
 def send_blueprint_file(self, filename, name=''):
     """Read an Ambari Blueprint from file and submit it under the given name.

     If no name is supplied it is determined from the blueprint contents,
     falling back to the filename without extension. The blueprint_name
     field inside the JSON is reset to the chosen name to avoid duplicate
     Blueprint.blueprint_name keys, then the blueprint is submitted via
     send_blueprint(). Exits via qquit('CRITICAL', ...) on read/JSON errors.
     """
     # log.debug('send_blueprint_file(%s, %s)' % (filename, name))
     validate_file(filename, 'blueprint', nolog=True)
     try:
         # with-block ensures the file handle is closed (was leaked before)
         with open(str(filename)) as _:
             file_data = _.read()
     except IOError as _:
         # bug fix: previously interpolated the 'file' builtin instead of filename
         err = "failed to read Ambari Blueprint from file '%s': %s" % (filename, _)
         # log.critical(err)
         qquit('CRITICAL', err)
     if not name:
         try:
             name = self.parse_blueprint_name(file_data)
             log.info("name not specified, determined blueprint name from file contents as '%s'" % name)
         except KeyError as _:
             pass
     if not name:
         # bug fix: use filename, not the 'file' builtin
         name = os.path.splitext(os.path.basename(filename))[0]
         log.info("name not specified and couldn't determine blueprint name from blueprint data, reverting to using filename without extension '%s'" % name) # pylint: disable=line-too-long
     # this solves the issue of having duplicate Blueprint.blueprint_name keys
     try:
         json_data = json.loads(file_data)
         json_data['Blueprints']['blueprint_name'] = name
         data = json.dumps(json_data)
         log.info("reset blueprint field name to '%s'" % name)
     except ValueError as _:
         # bug fix: report the filename and parse error (was filename builtin + name)
         qquit('CRITICAL', "invalid json found in file '%s': %s" % (filename, _))
     except KeyError as _:
         log.warn('failed to reset the Blueprint name: %s' % _)
         # bug fix: 'data' was left unbound on this path, raising NameError below -
         # fall back to sending the unmodified blueprint data
         data = file_data
     return self.send_blueprint(name, data)
Exemplo n.º 6
0
 def create_cluster(self, cluster, filename, blueprint=''):
     """Create an Ambari cluster from a cluster hosts mapping file.

     Reads the host mapping JSON from ``filename``, optionally injects the
     given blueprint name, then POSTs it to clusters/<cluster> and returns
     the response. Exits via qquit('CRITICAL', ...) on read errors or
     invalid JSON.
     """
     # log.debug('create_cluster(%s, %s)' % (filename, name))
     validate_file(filename, 'cluster hosts mapping', nolog=True)
     try:
         # with-block ensures the file handle is closed (was leaked before)
         with open(str(filename)) as _:
             file_data = _.read()
     except IOError as _:
         err = "failed to read Ambari cluster host mapping from file '%s': %s" % (filename, _)
         # log.critical(err)
         qquit('CRITICAL', err)
     log.info("creating cluster '%s' using file '%s'" % (cluster, filename))
     if not isJson(file_data):
         qquit('CRITICAL', "invalid json found in file '%s'" % filename)
     # don't have access to a blueprint name to enforce reset here
     if blueprint:
         try:
             log.info("setting blueprint in cluster creation to '%s'" % blueprint)
             json_data = json.loads(file_data)
             json_data['blueprint'] = blueprint
             file_data = json.dumps(json_data)
         except KeyError as _:
             log.warn("failed to inject blueprint name '%s' in to cluster creation" % blueprint)
     response = self.send('clusters/%s' % cluster, file_data)
     log.info("Cluster creation submitted, see Ambari web UI to track progress")
     return response
    def run(self):
        """Check whether an HBase table has a compaction in progress.

        Validates host/port/table options, fetches the HMaster table.jsp page
        for the table and sets WARNING if a compaction is running, appending
        the result to self.msg. Exits via qquit('CRITICAL', ...) on connection
        failure or a non-200 response (including table not found).
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        table = self.get_opt('table')
        validate_host(host)
        validate_port(port)
        validate_database_tablename(table)

        # raises 500 error if table doesn't exist
        url = 'http://%(host)s:%(port)s/table.jsp?name=%(table)s' % locals()
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
        if req.status_code != 200:
            info = ''
            #if req.status_code == '500' and 'TableNotFoundException' in req.content:
            # NOTE(review): req.content is bytes on Python 3 - this substring test
            # assumes a str body (Python 2); confirm the targeted Python version
            if 'TableNotFoundException' in req.content:
                info = 'table not found'
            qquit('CRITICAL', "%s %s %s" % (req.status_code, req.reason, info))
        is_table_compacting = self.parse_is_table_compacting(req.content)
        self.msg = 'HBase table \'{0}\' '.format(table)
        if is_table_compacting:
            self.warning()
            self.msg += 'has compaction in progress'
        else:
            self.msg += 'has no compaction in progress'
Exemplo n.º 8
0
 def parse(self, content):
     """Parse the HMaster UI status page and return the longest RIT time in ms.

     Finds the 'Regions in Transition' section, validates the table headers
     via assert_headers() and delegates the row scan to process_rows().
     Returns None when no such section exists (HMaster omits it when nothing
     is in transition). Exits via qquit('UNKNOWN', ...) on parse failure.
     """
     # could also collect lines after 'Regions-in-transition' if parsing /dump
     # sample:
     # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
     # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
     soup = BeautifulSoup(content, 'html.parser')
     #if log.isEnabledFor(logging.DEBUG):
     #    log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
     # looks like HMaster UI doesn't print this section if there are no regions in transition, must assume zero
     longest_rit_time = None
     try:
         headings = soup.findAll('h2')
         for heading in headings:
             log.debug("checking heading '%s'", heading)
             if heading.get_text() == "Regions in Transition":
                 log.debug('found Regions in Transition section header')
                 table = heading.find_next('table')
                 log.debug('checking first following table')
                 rows = table.findChildren('tr')
                 header_cols = rows[0].findChildren('th')
                 self.assert_headers(header_cols)
                 longest_rit_time = self.process_rows(rows)
                 return longest_rit_time
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse HBase Master UI status page. %s' % support_msg())
 def parse_version(self, soup):
     """Extract the HBase version from the UI 'attributes_table'.

     Sanity checks the table layout, scans for the 'HBase Version' attribute
     row and returns the version with any trailing build suffix (e.g.
     -hadoop2) stripped. Exits via qquit('UNKNOWN', ...) if the page cannot
     be parsed or no version row is found.
     """
     version = None
     try:
         attributes_table = soup.find('table', {'id':'attributes_table'})
         rows = attributes_table.findAll('tr')
         num_rows = len(rows)
         self.sanity_check(num_rows > 5, 'too few rows ({0})'.format(num_rows))
         headers = rows[0].findAll('th')
         num_headers = len(headers)
         self.sanity_check(num_headers > 2, 'too few header columns ({0})'.format(num_headers))
         self.sanity_check(headers[0].text.strip() == 'Attribute Name',
                           'header first column does not match expected \'Attribute Name\'')
         self.sanity_check(headers[1].text.strip() == 'Value',
                           'header second column does not match expected \'Value\'')
         for row in rows:
             cols = row.findAll('td')
             num_cols = len(cols)
             if num_cols == 0:
                 # header row holds th not td - skip it
                 continue
             self.sanity_check(num_cols > 2, 'too few columns ({0})'.format(num_cols))
             if cols[0].text.strip() == 'HBase Version':
                 # value cell looks like '<version>, r<revision>' - take first field
                 version = cols[1].text.split(',')[0]
                 break
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to find parse HBase output. {0}\n{1}'\
                          .format(support_msg(), traceback.format_exc()))
     # bug fix: previously version.split() raised AttributeError when the
     # 'HBase Version' row was missing - fail with a clear UNKNOWN instead
     if version is None:
         qquit('UNKNOWN', 'failed to find HBase Version in attributes table. ' + support_msg())
     # strip things like -hadoop2 at end
     version = version.split('-')[0]
     return version
 def get_tables(self):
     """Return the list of HBase tables from the Thrift connection.

     Exits via qquit() if the Thrift call fails or returns a non-list.
     """
     try:
         tables = self.conn.tables()
         if not isList(tables):
             qquit('UNKNOWN', 'table list returned is not a list! ' + support_msg_api())
         # bug fix: the table list was fetched and validated but never returned
         return tables
     except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
         qquit('CRITICAL', 'error while trying to get table list: {0}'.format(_))
Exemplo n.º 11
0
    def run(self):
        """Check HBase RegionServer compaction status via the /jmx endpoint.

        Fetches the RegionServer JMX page, parses compactionQueueSize and
        sets WARNING when a compaction is in progress, emitting perfdata.
        Exits via qquit('CRITICAL', ...) on connection failure or non-200.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)

        url = 'http://%(host)s:%(port)s/jmx' % locals()
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
        if req.status_code != 200:
            qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
        compaction_queue_size = self.parse(req.content)
        self.msg = 'HBase RegionServer compaction '
        if compaction_queue_size > 0:
            self.warning()
            self.msg += 'in progress'
        else:
            self.msg += 'not in progress'
        self.msg += ', compactionQueueSize = {0}'.format(compaction_queue_size)
        self.msg += ' | compactionQueueSize={0};0;0'.format(compaction_queue_size)
Exemplo n.º 12
0
 def assert_headers(header_cols):
     """Verify the regions-in-transition table header columns match expectations."""
     expected = ('Region', 'State', 'RIT time (ms)')
     try:
         # compare each header cell against its expected caption, in order
         for index, caption in enumerate(expected):
             assert header_cols[index].get_text().strip() == caption
     except AssertionError as _:
         qquit('UNKNOWN', 'parsing failed, headers did not match expected - {0}'.format(_))
Exemplo n.º 13
0
 def connection(self, host, port, user, password, ssl=False, **kwargs):
     """Validate and store Ambari connection settings and prepare the blueprint dir.

     Sets the X-Requested-By header value (required by newer Ambari versions),
     builds self.url_base from host/port/protocol and ensures the blueprint
     data directory exists. Raises InvalidOptionException on invalid options
     and exits via qquit()/die() on an invalid or uncreatable directory.
     """
     # must set X-Requested-By in newer versions of Ambari
     self.x_requested_by = user
     if user == 'admin':
         self.x_requested_by = os.getenv('USER', user)
     #log.info("contacting Ambari as '%s'" % self.user)
     if not isHost(host) or not isPort(port) or not isUser(user) or not password:
         raise InvalidOptionException('invalid options passed to AmbariBlueprint()')
     proto = 'http' # pylint: disable=unused-variable
     if ssl:
         proto = 'https'
     self.host = host
     self.port = port
     self.user = user
     self.password = password
     # if kwargs.has_key('strip_config') and kwargs['strip_config']:
     if 'strip_config' in kwargs and kwargs['strip_config']:
         self.strip_config = True
     self.url_base = '%(proto)s://%(host)s:%(port)s/api/v1' % locals()
     if 'dir' in kwargs and kwargs['dir']:
         self.blueprint_dir = kwargs['dir']
     if not isDirname(self.blueprint_dir):
         qquit('UNKNOWN', 'invalid dir arg passed to AmbariBlueprintTool')
     try:
         if not self.blueprint_dir or not os.path.exists(self.blueprint_dir):
             log.info("creating blueprint data dir '%s'" % self.blueprint_dir)
             os.mkdir(self.blueprint_dir)
         if not os.path.isdir(self.blueprint_dir):
             # bug fix: message was missing a space after the quoted path
             raise IOError("blueprint dir '%s' already taken and is not a directory" % self.blueprint_dir)
     except IOError as _:
         # bug fix: removed stray leading quote from the error message
         die("failed to create dir '%s': %s" % (self.blueprint_dir, _))
Exemplo n.º 14
0
    def run(self):
        """Check HBase RegionServer region-count imbalance via the HMaster UI.

        Fetches /master-status, parses per-server region counts, computes the
        % imbalance between the most and least loaded RegionServers and
        compares it against thresholds, emitting perfdata.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)
        self.validate_thresholds(integer=False)

        url = 'http://%(host)s:%(port)s/master-status' % locals()
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
        if req.status_code != 200:
            qquit('CRITICAL', ("%s %s" % (req.status_code, req.reason)))
        # parse_output() populates self.server_min_regions / self.server_max_regions
        self.parse_output(req.content)
        log.info('server with min regions = %s regions on %s', self.server_min_regions[1], self.server_min_regions[0])
        log.info('server with max regions = %s regions on %s', self.server_max_regions[1], self.server_max_regions[0])
        imbalance = self.calculate_imbalance()
        self.msg = '{0}% region imbalance'.format(imbalance)
        self.check_thresholds(imbalance)
        self.msg += ' between HBase RegionServers hosting the most vs least number of regions'
        self.msg += ' (min = {0}, max = {1})'.format(self.server_min_regions[1], self.server_max_regions[1])
        self.msg += " | '% region imbalance'={0}%".format(imbalance)
        self.msg += self.get_perf_thresholds()
        self.msg += ' min_regions={0} max_regions={1}'.format(self.server_min_regions[1], self.server_max_regions[1])
 def connect(self):
     """Open a happybase connection to the HBase Thrift server, storing it on self.conn."""
     log.info('connecting to HBase Thrift Server at %s:%s', self.host, self.port)
     # cast port to int to avoid low level socket module TypeError for ports > 32000
     timeout_millis = 10 * 1000
     try:
         self.conn = happybase.Connection(host=self.host, port=int(self.port), timeout=timeout_millis)  # ms
     except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
         qquit('CRITICAL', 'error connecting: {0}'.format(_))
Exemplo n.º 16
0
 def process_rows(rows):
     """Return the longest region-in-transition time (ms) found in the RIT table rows.

     Returns None if no individual region rows are present. Exits via
     qquit('UNKNOWN', ...) on an unexpected table layout or a non-integer
     RIT time value.
     """
     longest_rit_time = None
     # will skip header anyway when it doesn't find td (will contain th instead)
     # this will avoid accidentally skipping a row later if the input changes to rows[1:] instead of rows
     #for row in rows[1:]:
     for row in rows:
         # bug fix: removed stray debug print(row) which corrupted the
         # plugin's stdout (Nagios parses stdout as the check result)
         cols = row.findChildren('td')
         # Regions in Transition rows only have 2 cols
         # <hex> region rows have Region, State, RIT time (ms)
         num_cols = len(cols)
         if num_cols == 0:
             # header row
             continue
         elif num_cols != 3:
             qquit('UNKNOWN', 'unexpected number of columns ({0}) '.format(num_cols)
                   + 'for regions in transition table. ' + support_msg())
         if 'Regions in Transition' in cols[0].get_text():
             # summary row, not an individual region
             continue
         rit_time = cols[2].get_text().strip()
         if not isInt(rit_time):
             qquit('UNKNOWN', 'parsing failed, got region in transition time of ' +
                   "'{0}', expected integer".format(rit_time))
         rit_time = int(rit_time)
         # bug fix: guard the initial None - 'int > None' raises TypeError on Python 3
         if longest_rit_time is None or rit_time > longest_rit_time:
             longest_rit_time = rit_time
     return longest_rit_time
Exemplo n.º 17
0
 def main(self):
     """Run the plugin, translating exceptions into Nagios exit states.

     Delegates to the base class main() and maps CriticalError/WarningError/
     UnknownError to their respective statuses, CodingError to UNKNOWN with a
     programming-error message, and any other exception to UNKNOWN including
     the exception type, message and support details.
     """
     try:
         # Python 2.x
         super(NagiosPlugin, self).main()
         # Python 3.x
         # super().__init__()
         # redirect_stderr_stdout()
     except CriticalError as _:
         qquit('CRITICAL', _)
     except WarningError as _:
         qquit('WARNING', _)
     except UnknownError as _:
         qquit('UNKNOWN', _)
     except CodingError as _:
         qquit('UNKNOWN', 'Programming Error: {0}. {1}'.format(_, support_msg()))
     except Exception as _:  # pylint: disable=broad-except
         # last-resort handler: report the exception type and message as UNKNOWN
         exception_type = type(_).__name__
         if log.isEnabledFor(logging.DEBUG):
             log.debug("exception: '%s'", exception_type)
             log.debug(traceback.format_exc())
         msg = 'Nagios Plugin Exception: {exception_type}: {msg}'.format(exception_type=exception_type, msg=self.exception_msg())
         #msg = ', '.join([x.strip() for x in msg.split('\n')])
         # ', ' doesn't look nice for ':\n ...' => ':, ...' (snakebite OutOfNNException)
         #msg = '\t'.join([x.strip() for x in msg.split('\n')])
         #if self.options.verbose > 2:
         #    msg = type(_).__name__ + ': ' + msg
         msg += '. ' + support_msg()
         qquit('UNKNOWN', msg)
 def get_ingestions(self, num=None, filter_opts=None):
     """Query Zaloni for ingestion history and return the parsed JSON dict.

     The result limit is num when given, 10 when filters are supplied,
     otherwise 100. Exits via qquit('UNKNOWN', ...) on unparseable JSON.
     """
     log.info('getting ingestion history')
     # pick the page size: explicit request > filtered default > catch-all
     if num:
         limit = num
         log.info('explicit number of results requested: %s', limit)
     elif filter_opts:
         limit = 10
         log.info('filters detected, defaulting number of results to %s', limit)
     else:
         limit = 100
         log.info('using catch all default result limit of %s', limit)
     settings = {'chunkSize': limit, 'currentPage': 1}
     if filter_opts is not None:
         if not isDict(filter_opts):
             code_error('passed non-dictionary for filter opts to get_ingestions')
         for key, value in sorted(filter_opts.items()):
             log.info("filter: '%s' = '%s'", key, value)
         settings = merge_dicts(settings, filter_opts)
     log.info('settings: %s', settings)
     log.info('querying Zaloni for ingestion history')
     # orders by newest first, but seems to return last 10 anyway
     (req, self.query_time) = self.req(url='{url_base}/ingestion/publish/getFileIndex'
                                       .format(url_base=self.url_base),
                                       body=json.dumps(settings))
     try:
         log.info('parsing JSON response')
         json_dict = json.loads(req.content)
     except ValueError as _:
         qquit('UNKNOWN', 'error parsing json returned by Zaloni: {0}'.format(_))
     return json_dict
Exemplo n.º 19
0
 def run(self):
     """Check minutes since the last deployment reported by the deployments API.

     GETs .../deployments with HTTP basic auth, takes the first (most
     recent) deployment from the returned list, reports how many minutes
     ago it occurred against the thresholds and emits perfdata. Exits via
     qquit() on connection, HTTP or parse errors.
     """
     log.info("querying %s", self.software)
     url = "{protocol}://{host}:{port}/PolicyManagement/{api_version}/deployments".format(
         host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol
     )
     log.debug("GET %s", url)
     try:
         req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
     except requests.exceptions.RequestException as _:
         errhint = ""
         # bug fix: exception objects have no .message attribute on Python 3,
         # which raised AttributeError inside this handler - use str(_) instead
         if "BadStatusLine" in str(_):
             errhint = " (possibly connecting to an SSL secured port without using --ssl?)"
         elif self.protocol == "https" and "unknown protocol" in str(_):
             errhint = " (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)"
         qquit("CRITICAL", str(_) + errhint)
     log.debug("response: %s %s", req.status_code, req.reason)
     log.debug("content:\n%s\n%s\n%s", "=" * 80, req.content.strip(), "=" * 80)
     if req.status_code == 400 and req.reason == "Bad Request":
         qquit(
             "CRITICAL",
             "{0}: {1} (possibly new install with no deployments yet?)".format(req.status_code, req.reason),
         )
     if req.status_code != 200:
         qquit("CRITICAL", "{0}: {1}".format(req.status_code, req.reason))
     try:
         json_list = json.loads(req.content)
         if log.isEnabledFor(logging.DEBUG):
             print(jsonpp(json_list))
             print("=" * 80)
         if not isList(json_list):
             raise ValueError("returned content is not a list")
         if not json_list:
             qquit("UNKNOWN", "no deployments found")
         last_deployment = json_list[0]
         userid = last_deployment["UserId"]
         description = last_deployment["Description"]
         hostname = last_deployment["HostName"]
         timestamp = last_deployment["timestamp"]
         # NOTE(review): %H (24-hour) combined with %p means the AM/PM marker is
         # parsed but ignored - if the API emits 12-hour times this should be %I;
         # confirm against live API output before changing
         last_deploy_datetime = datetime.strptime(timestamp, "%b %d, %Y %H:%M:%S %p")
     except (KeyError, ValueError) as _:
         qquit(
             "UNKNOWN",
             "error parsing output from {software}: {exception}: {error}. {support_msg}".format(
                 software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()
             ),
         )
     timedelta = datetime.now() - last_deploy_datetime
     mins = int(int(timedelta.total_seconds()) / 60)
     self.msg = "{software} last deployment was at '{timestamp}', {mins} mins ago".format(
         software=self.software, timestamp=timestamp, mins=mins
     )
     self.check_thresholds(mins)
     if self.verbose:
         self.msg += " by user '{userid}', host = '{hostname}', description = '{description}'".format(
             userid=userid, hostname=hostname, description=description
         )
     self.msg += " | mins_since_last_deployment={mins}{thresholds}".format(
         mins=mins, thresholds=self.get_perf_thresholds(boundary="lower")
     )
Exemplo n.º 20
0
 def __end__(self):
     """Finalise the plugin: run base-class end hooks, log, then exit with status and message."""
     super(NagiosPlugin, self).__end__()
     # enabling this would break existing PNP4Nagios data due to the change in num perfdata fields
     #if '|' not in self.msg:
     #    self.msg += ' |'
     #self.msg += ' check_time={0:.2f}s'.format(CLI.__total_plugin_time)
     log.info('end\n%s\n', '='*80)
     qquit(self.status, self.msg)
Exemplo n.º 21
0
 def timeout_handler(self, signum, frame): # pylint: disable=unused-argument
     """Signal handler: exit UNKNOWN when the self-imposed plugin timeout fires."""
     # problem with this is that it'll print and then the exit exception will be caught and quit() printed again
     # raising a custom TimeoutException will need to be handled in main, but that would also likely print and be
     # re-caught and re-printed by NagiosPlugin
     #print('self timed out after %d second%s' % (self.timeout, plural(self.timeout)))
     #sys.exit(ERRORS['UNKNOWN'])
     # if doing die the same thing same will happen since die is a custom func which prints and then calls exit,
     # only exit would be caught
     qquit('UNKNOWN', 'self timed out after %d second%s' % (self.timeout, plural(self.timeout)))
Exemplo n.º 22
0
 def save_all(self):
     """Fetch and save every Ambari blueprint and every cluster's blueprint."""
     log.info('finding all blueprints and clusters to blueprint')
     blueprints = self.get_blueprints()
     clusters = self.get_clusters()
     # nothing to save at all is treated as an error condition
     if not blueprints and not clusters:
         qquit('UNKNOWN', 'no Ambari Blueprints or Clusters found on server')
     for item in blueprints:
         self.save_blueprint(item)
     for item in clusters:
         self.save_cluster(item)
 def get_timedelta(ingestion_date):
     """Return a timedelta from the given Zaloni ingestion date string to now.

     Exits via qquit('UNKNOWN', ...) if the date is missing/null or does not
     match the expected '%Y-%m-%d %H:%M:%S.%f' format.
     """
     ingestion_date = str(ingestion_date).strip()
     invalid_ingestion_dates = ('', 'null', 'None', None)
     # bug fix: an invalid date previously fell through to an UnboundLocalError
     # on ingestion_datetime below - fail with an explicit UNKNOWN instead
     if ingestion_date in invalid_ingestion_dates:
         qquit('UNKNOWN', 'invalid ingestion date returned: {0}'.format(ingestion_date))
     try:
         # parsing the date will break notifying us if the API format changes in future
         # whereas if millis changes to secs or similar we could be way off
         ingestion_datetime = datetime.strptime(ingestion_date, '%Y-%m-%d %H:%M:%S.%f')
     except ValueError as _:
         qquit('UNKNOWN', 'error parsing ingestion date time format: {0}'.format(_))
     time_delta = datetime.now() - ingestion_datetime
     return time_delta
Exemplo n.º 24
0
 def end(self):
     """Build the final status message with perfdata and quit with the accumulated status."""
     if self.node_count is None:
         raise UnknownError('node count is not set!')
     count = self.node_count
     self.msg = '{0} {1}{2} {3}'.format(count, self.agent_name, plural(count), self.state)
     self.check_thresholds(count)
     if self.additional_info:
         self.msg += ', {0}'.format(self.additional_info)
     self.msg += ' | {0}s_{1}={2:d}s{3}'.format(self.agent_name, self.state,
                                                count, self.get_perf_thresholds())
     if self.additional_perfdata:
         self.msg += ' {0}'.format(self.additional_perfdata)
     qquit(self.status, self.msg)
Exemplo n.º 25
0
 def list(self, url_suffix):
     """GET the given URL suffix and return the decoded JSON data."""
     self.url = self.url_base + '/' + url_suffix
     try:
         response = self.get(url_suffix)
     except requests.exceptions.RequestException as _:
         # log.critical(err)
         qquit('CRITICAL', 'failed to fetch list of Ambari Blueprints: %s' % _)
     json_data = json.loads(response)
     if log.isEnabledFor(logging.DEBUG):
         log.debug("json_data = " + jsonpp(json_data))
     return json_data
Exemplo n.º 26
0
 def check_table(self):
     """Run write/read/delete canary checks against every column family of the table."""
     log.info('checking table \'%s\'', self.table)
     if not self.conn.is_table_enabled(self.table):
         qquit('CRITICAL', "table '{0}' is disabled!".format(self.table))
     table_conn = self.conn.table(self.table)
     families = table_conn.families()
     self.num_column_families = len(families)
     log.info('found %s column families: %s', self.num_column_families, families)
     # exercise each column family with the same canary row/qualifier
     for family in sorted(families):
         qualified_column = '{0}:{1}'.format(family, self.column_qualifier)
         self.check_write(table_conn, self.row, qualified_column)
         self.check_read(table_conn, self.row, qualified_column, self.value)
         self.check_delete(table_conn, self.row, qualified_column)
 def run(self):
     """List metrics if requested, otherwise fetch, parse and report the latest data."""
     try:
         if self.get_opt('list_metrics'):
             self.list_metrics()
         response = self.get('lastdata', params={'metrics': self.metrics})
         parsed_metrics = self.parse_metrics(response)
         self.msg_metrics(parsed_metrics)
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
                          .format(software=self.software,
                                  exception=type(_).__name__,
                                  error=_,
                                  support_msg=support_msg_api()))
    def run(self):
        """Check Oozie server status via the v1 admin/status REST endpoint.

        Sets OK when systemMode is NORMAL, CRITICAL otherwise. Exits via
        qquit() on connection failure, non-200 response, non-JSON output or
        a missing systemMode key.
        """
        self.no_args()
        host = self.options.host
        port = self.options.port
        validate_host(host)
        validate_port(port)

        url = 'http://%(host)s:%(port)s/oozie/v1/admin/status' % locals()
        log.debug('GET %s' % url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s" % (req.status_code, req.reason))
        log.debug("content: '%s'" % req.content)
        if req.status_code != 200:
            qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
        # should == NORMAL
        if not isJson(req.content):
            qquit('UNKNOWN', 'non-JSON returned by Oozie server at {0}:{1}'.format(host, port))
        status = None
        try:
            _ = json.loads(req.content)
            status = _['systemMode']
        except KeyError:
            qquit('UNKNOWN', 'systemMode key was not returned in output from Oozie. {0}'.format(support_msg_api()))
        self.msg = 'Oozie status = {0}'.format(status)
        if status == 'NORMAL':
            self.ok()
        else:
            self.critical()
Exemplo n.º 29
0
    def run(self):
        """Check the number of HBase regions stuck in transition via the HMaster UI.

        Fetches /master-status (the JMX metric is unreliable, see HBASE-16636
        comments below), parses the stuck-in-transition count and sets OK when
        zero, CRITICAL otherwise, emitting perfdata.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)

        # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for beans
        # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
        # https://issues.apache.org/jira/browse/HBASE-16636
        #url = 'http://%(host)s:%(port)s/jmx' % locals()
        # could get info from flat txt debug page but it doesn't contain the summary count
        #url = 'http://%(host)s:%(port)s/dump' % locals()
        url = 'http://%(host)s:%(port)s/master-status' % locals()
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
        if req.status_code != 200:
            qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
        regions_stuck_in_transition = self.parse(req.content)
        if regions_stuck_in_transition is None:
            qquit('UNKNOWN', 'parse error - failed to find number for regions stuck in transition')
        if not isInt(regions_stuck_in_transition):
            qquit('UNKNOWN', 'parse error - got non-integer for regions stuck in transition when parsing HMaster UI')
        if regions_stuck_in_transition == 0:
            self.ok()
        else:
            self.critical()
        self.msg = '{0} regions stuck in transition (ie. transitioning longer than HBase threshold)'\
                   .format(regions_stuck_in_transition)
        self.msg += " | regions_stuck_in_transition={0};0;0".format(regions_stuck_in_transition)
 def parse(self, req):
     """Parse the Apache Drill status page, set OK/CRITICAL from the banner, return the status text."""
     soup = BeautifulSoup(req.content, 'html.parser')
     # if log.isEnabledFor(logging.DEBUG):
     #     log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
     status = None
     try:
         banner = soup.find('div', {'class': 'alert alert-success'})
         status = banner.get_text().strip()
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse Apache Drill status page. %s' % support_msg())
     # healthy Drill shows a 'Running' (optionally 'Running!') success banner
     if re.match('Running!?$', status):
         self.ok()
     else:
         self.critical()
     return status
Exemplo n.º 31
0
    def run(self):
        """Check the number of HBase regions currently in transition via the HMaster UI.

        Fetches /master-status (the JMX metric is unreliable, see HBASE-16636
        comments below), parses the regions-in-transition count and compares
        it against the configured warning/critical thresholds, emitting
        perfdata.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)
        self.validate_thresholds()

        # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for beans
        # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
        # https://issues.apache.org/jira/browse/HBASE-16636
        #url = 'http://%(host)s:%(port)s/jmx' % locals()
        # could get info from flat txt debug page but it doesn't contain the summary count
        #url = 'http://%(host)s:%(port)s/dump' % locals()
        url = 'http://%(host)s:%(port)s/master-status' % locals()
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(),
                  '=' * 80)
        if req.status_code != 200:
            qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
        regions_in_transition = self.parse(req.content)
        if regions_in_transition is None:
            qquit(
                'UNKNOWN',
                'parse error - failed to find number for regions in transition'
            )
        if not isInt(regions_in_transition):
            qquit(
                'UNKNOWN',
                'parse error - got non-integer for regions in transition when parsing HMaster UI'
            )
        if regions_in_transition == 0:
            self.ok()
        else:
            self.critical()
        self.msg = '{0} regions in transition'\
                   .format(regions_in_transition)
        self.check_thresholds(regions_in_transition)
        self.msg += " | regions_in_transition={0}".format(
            regions_in_transition)
        self.msg += self.get_perf_thresholds()
Exemplo n.º 32
0
 def check_ingestion(self, num, filter_opts=None, max_age=None, max_runtime=None):
     """Check recent ingestion history and build self.msg with results + perfdata.

     :param num: number of most recent ingestions requested
     :param filter_opts: optional dict of filter key=value pairs applied to the query
     :param max_age: max age threshold for the last ingest (converted x3600 for
                     perfdata, so presumably hours — confirm)
     :param max_runtime: threshold passed to the longest-incomplete-ingest check
     Quits CRITICAL when no results match the filters, UNKNOWN on parse errors.
     """
     log.info('checking ingestion history')
     json_dict = self.get_ingestions(num, filter_opts)
     info = ''
     # verbose mode: echo the filters used back into the output message
     if self.verbose:
         for key in sorted(filter_opts):
             info += " {0}='{1}'".format(key, filter_opts[key])
     try:
         results = json_dict['result']
         if not results:
             qquit('CRITICAL', "no results found for ingestion{0}"\
                   .format('{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                   'Perhaps you specified incorrect filters? Use --list to see existing ingestions'))
         num_results = len(results)
         log.info('%s ingestion history results returned', num_results)
         self.check_statuses(results)
         if num:
             self.msg += ' out of last {0} ingest{1}'.format(num_results, plural(num_results))
         if self.history_mins:
             self.msg += ' within last {0} ({1} min{2})'.format(sec2human(self.history_mins * 60),
                                                                str(self.history_mins).rstrip('0').rstrip('.'),
                                                                plural(self.history_mins))
         longest_incomplete_timedelta = self.check_longest_incomplete_ingest(results, max_runtime)
         age_timedelta_secs = self.check_last_ingest_age(results, max_age=max_age)
         self.msg_filter_details(filter_opts=filter_opts)
         # perfdata: ages in seconds, warning thresholds converted from hours
         self.msg += ' |'
         self.msg += ' last_ingest_age={0}s;{1}'.format(age_timedelta_secs,
                                                        max_age * 3600 if max_age else '')
         # NOTE(review): this perfdata threshold reuses max_age rather than
         # max_runtime — looks suspicious, confirm which threshold was intended
         self.msg += ' longest_incomplete_ingest_age={0}s;{1}'\
                     .format(self.timedelta_seconds(longest_incomplete_timedelta)
                             if longest_incomplete_timedelta else 0,
                             max_age * 3600 if max_age else '')
         self.msg += ' auth_time={auth_time}s query_time={query_time}s'.format(auth_time=self.auth_time,
                                                                               query_time=self.query_time)
     except KeyError as _:
         qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))
Exemplo n.º 33
0
 def fetch(self, url_suffix):
     """Fetch an Ambari blueprint and return it pretty-printed as JSON.

     Strips the 'href' field (not valid when re-submitting a blueprint),
     optionally strips config sections (self.strip_config), and sorts
     host groups and their components by name for deterministic output.
     Quits CRITICAL on fetch or sort failure.
     """
     err = ''
     try:
         response = self.get(url_suffix)
     except requests.exceptions.RequestException as _:
         err = "failed to fetch Ambari Blueprint from '%s': %s" % (self.url, _)
         qquit('CRITICAL', err)
     blueprint = json.loads(response)
     if log.isEnabledFor(logging.DEBUG):
         log.debug("blueprint = " + jsonpp(blueprint))
     try:
         del blueprint['href']
         log.debug("stripped href as it's not valid if re-submitting the blueprint to Ambari")
     except KeyError:
         pass
     # Ambari 2.1.3 also supports an optional 'config_recommendation_strategy'
     # field (NEVER_APPLY / ONLY_STACK_DEFAULTS_APPLY / ALWAYS_APPLY), see:
     # https://cwiki.apache.org/confluence/display/AMBARI/Blueprints#Blueprints-ClusterCreationTemplateStructure
     if self.strip_config:
         log.info('stripping out config sections of blueprints to make more generic')
         try:
             del blueprint['configurations']
             for group in blueprint['host_groups']:
                 del group['configurations']
         except KeyError:
             pass
     try:
         blueprint['host_groups'] = list_sort_dicts_by_value(blueprint['host_groups'], 'name')
         for group in blueprint['host_groups']:
             group['components'] = list_sort_dicts_by_value(group['components'], 'name')
     except KeyError as _:
         qquit('CRITICAL', 'failed to sort blueprint: %s' % _)
     return jsonpp(blueprint)
 def get_latest_build(self, content):
     """Return the most recent finished build from Travis CI API JSON.

     Increments self.builds_in_progress for every unfinished build seen.
     Quits UNKNOWN when the API returns no builds at all, or when none
     of them have finished yet.
     """
     builds = json.loads(content)
     if not builds:
         qquit(
             'UNKNOWN', "no Travis CI builds returned by the Travis API." +
             " Either the specified repo '{0}' doesn't exist".format(
                 self.repo) + " or no builds have happened yet?" +
             " Also remember the repo is case sensitive, for example 'harisekhon/nagios-plugins' returns this"
             +
             " blank build set whereas 'HariSekhon/nagios-plugins' succeeds"
             + " in returning latest builds information")
     # builds are ordered newest first, so the first finished one is the latest
     latest_finished = None
     for build in builds:
         if build['state'] != 'finished':
             self.builds_in_progress += 1
         elif latest_finished is None:
             latest_finished = build
     if latest_finished is None:
         qquit('UNKNOWN', 'no recent builds finished yet')
     if log.isEnabledFor(logging.DEBUG):
         log.debug("latest build:\n{0}".format(jsonpp(latest_finished)))
     return latest_finished
Exemplo n.º 35
0
Arquivo: cli.py Projeto: smutel/pylib
 def main(self):
     """Top-level CLI driver: parse options, configure logging and timeout, run.

     Invokes self.run() and maps CriticalError/WarningError/UnknownError to
     the corresponding Nagios-style exit via qquit(); InvalidOptionException
     prints usage instead.
     """
     # log.debug('running main()')
     log.setLevel(logging.WARN)
     self.setup()
     try:
         self.add_options()
         self.add_default_opts()
     except InvalidOptionException as _:
         self.usage(_)
     try:
         self.__parse_args__()
         # broken
         # autoflush()
         # too late
         # os.environ['PYTHONUNBUFFERED'] = "anything"
         self.verbose = self.get_opt('verbose')
         # --quiet overrides any verbosity level
         if self.is_option_defined('quiet') and self.get_opt('quiet'):
             self.verbose = 0
         elif self.verbose > 2:
             log.setLevel(logging.DEBUG)
         elif self.verbose > 1:
             log.setLevel(logging.INFO)
         elif self.verbose > 0 and self._prog[0:6] != 'check_':
             log.setLevel(logging.WARN)
         if self.options.debug:
             log.setLevel(logging.DEBUG)  # pragma: no cover
             log.debug('enabling debug logging')
             if self.verbose < 3:
                 self.verbose = 3
         log.info('Hari Sekhon %s', self.version)
         log.info(self._github_repo)
         log.info('verbose level: %s (%s)', self.verbose, logging.getLevelName(log.getEffectiveLevel()))
         if self.timeout is not None:
             # arm a SIGALRM watchdog so a hung run() cannot block forever
             validate_int(self.timeout, 'timeout', 0, self.timeout_max)
             log.debug('setting timeout alarm (%s)', self.timeout)
             signal.signal(signal.SIGALRM, self.timeout_handler)
             signal.alarm(int(self.timeout))
         # if self.options.version:
         #     print(self.version)
         #     sys.exit(ERRORS['UNKNOWN'])
         self.process_options()
         self.process_args()
         try:
             self.run()
         except CriticalError as _:
             qquit('CRITICAL', _)
         except WarningError as _:
             qquit('WARNING', _)
         except UnknownError as _:
             qquit('UNKNOWN', _)
         self.__end__()
     except InvalidOptionException as _:
         self.usage(_)  # pragma: no cover
     except KeyboardInterrupt:
         # log.debug('Caught control-c...')
         print('Caught control-c...')  # pragma: no cover
 def parse_output(self, content):
     """Parse the HBase Master UI 'Base Stats' table and record the
     RegionServers hosting the fewest and most regions.

     :param content: HTML of the HMaster UI page
     Updates self.server_min_regions / self.server_max_regions as
     (server, num_regions) tuples. Quits UNKNOWN when the UI layout has
     changed or values cannot be parsed.
     """
     soup = BeautifulSoup(content, 'html.parser')
     if log.isEnabledFor(logging.DEBUG):
         log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '='*80))
     # shorter to just catch NoneType attribute error when tag not found and returns None
     try:
         basestats = soup.find('div', {'id': 'tab_baseStats'})
         table = basestats.find('table')
         rows = table.findAll('tr')
         headers = rows[0].findAll('th')
         header_server = headers[0].text
         # HBase 1.1 in HDP 2.3: ServerName | Start time | Requests Per Second | Num. Regions
         # HBase 1.2 (Apache):   ServerName | Start time | Version | Requests per Second | Num. Regions
         # HBase 1.4 (Apache):   ServerName | Start time | Last Contact | Version | Requests Per Second | Num. Regions
         # 'Num. Regions' is the last column in every known layout
         num_regions_index = len(headers) - 1
         header_num_regions = headers[num_regions_index].text
         if header_server != 'ServerName':
             qquit('UNKNOWN', "Table headers in Master UI have changed" +
                   " (got {0}, expected 'ServerName'). ".format(header_server) + support_msg())
         if header_num_regions != 'Num. Regions':
             qquit('UNKNOWN', "Table headers in Master UI have changed" +
                   " (got {0}, expected 'Num. Regions'). ".format(header_num_regions) + support_msg())
         log.debug('%-50s\tnum_regions', 'server')
         for row in rows[1:]:
             # this can be something like:
             # 21689588ba40,16201,1473775984259
             # so don't apply isHost() validation because it'll fail FQDN / IP address checks
             cols = row.findAll('td')
             server = cols[0].text
             # skip the summary 'Total:' row
             if self.total_regex.match(server):
                 continue
             num_regions = cols[num_regions_index].text
             if not isInt(num_regions):
                 # bugfix: second format string previously used field {1} with
                 # only one positional arg, raising IndexError instead of
                 # producing the intended UNKNOWN message
                 qquit('UNKNOWN', "parsing error - got '{0}' for num regions".format(num_regions) +
                       " for server '{0}', was expecting integer.".format(server) +
                       " UI format must have changed" + support_msg())
             num_regions = int(num_regions)
             log.debug('%-50s\t%s', server, num_regions)
             # track min/max; None sentinel means 'not yet seen'
             if self.server_min_regions[1] is None or num_regions < self.server_min_regions[1]:
                 self.server_min_regions = (server, num_regions)
             if self.server_max_regions[1] is None or num_regions > self.server_max_regions[1]:
                 self.server_max_regions = (server, num_regions)
     except (AttributeError, TypeError, IndexError):
         qquit('UNKNOWN', 'failed to find parse output')
    def run(self):
        """Check for dead Tachyon workers by scraping the Master UI /workers page.

        Counts rows in the table with id='data2' and goes CRITICAL when any
        dead workers are found; UNKNOWN when the page cannot be parsed.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)

        log.info('querying Tachyon Master')
        url = 'http://%(host)s:%(port)s/workers' % locals()
        log.debug('GET %s' % url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s" % (req.status_code, req.reason))
        log.debug("content:\n{0}\n{1}\n{2}".format('=' * 80,
                                                   req.content.strip(),
                                                   '=' * 80))
        if req.status_code != 200:
            qquit('CRITICAL',
                  "Non-200 response! %s %s" % (req.status_code, req.reason))
        soup = BeautifulSoup(req.content, 'html.parser')
        dead_workers = 0
        try:
            # count rows in the dead-workers table; the 'if _' filter appears
            # to be a no-op since row tags are always truthy — confirm intent
            dead_workers = len([
                _ for _ in soup.find(id='data2').find('tbody').find_all('tr')
                if _
            ])
        except (AttributeError, TypeError):
            # NOTE(review): '%' against self.__dict__ is a no-op here (the
            # message has no format specifiers) — presumably template leftover
            qquit(
                'UNKNOWN',
                'failed to find parse Tachyon Master info for dead workers' %
                self.__dict__)
        try:
            # NOTE(review): dead_workers is already an int (len of a list), so
            # this conversion appears redundant — harmless defensive coding
            dead_workers = int(dead_workers)
        except (ValueError, TypeError):
            qquit(
                'UNKNOWN',
                'Tachyon Master dead workers parsing returned non-integer: {0}'
                .format(dead_workers))
        self.msg = 'Tachyon dead workers = {0}'.format(dead_workers)  # pylint: disable=attribute-defined-outside-init
        self.ok()
        # TODO: thresholds on number of dead workers (coming soon)
        if dead_workers:
            self.critical()
Exemplo n.º 38
0
 def get_version(self):
     """Query the Blue Talon API and return (build_version, extra_info).

     Validates that the response really comes from a Blue Talon server
     (company_name / company_website fields) and that the returned API
     version matches the expected one. Quits UNKNOWN on any mismatch or
     parse error.
     """
     content = self.get()
     try:
         json_list = json.loads(content)
         if log.isEnabledFor(logging.DEBUG):
             print(jsonpp(json_list))
             print('='*80)
         if not isList(json_list):
             raise ValueError("non-list returned by API (is type '{0}')".format(type(json_list)))
         # API returns a list; the version info lives in the first element
         json_dict = json_list[0]
         if not isDict(json_dict):
             raise ValueError("non-dict found inside returned list (is type '{0}')".format(type(json_dict)))
         company_name = json_dict['company_name']
         company_website = json_dict['company_website']
         # sanity check that we are actually talking to a Blue Talon server
         regex = re.compile(r'Blue\s*Talon', re.I)
         if not regex.match(company_name) and \
            not regex.match(company_website):
             qquit('UNKNOWN', 'Blue Talon name was not found in either company_name or company_website fields' \
                            + ', are you definitely querying a Blue Talon server?')
         build_version = json_dict['build_version']
         update_date = json_dict['update_date']
         api_version = json_dict['api_version']
         if not isVersion(api_version):
             qquit('UNKNOWN', '{0} api version unrecognized \'{1}\'. {2}'\
                              .format(self.software, api_version, support_msg_api()))
         if api_version != self.api_version:
             qquit('UNKNOWN', "unexpected API version '{0}' returned (expected '{1}')"\
                              .format(api_version, self.api_version))
         # verbose mode adds revision/build/schema details to the output
         if self.verbose:
             extra_info = ' revision {revision} build {build}, schema revision = {schema_revision}'\
                           .format(revision=json_dict['revision_no'],
                                   build=json_dict['build_no'],
                                   schema_revision=json_dict['schema_revision'])
             extra_info += ', api version = {api_version}, update date = {update_date}'\
                           .format(api_version=api_version, update_date=update_date)
         else:
             extra_info = ', update date = {update_date}'.format(update_date=update_date)
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
                          .format(software=self.software,
                                  exception=type(_).__name__,
                                  error=_,
                                  support_msg=support_msg_api()))
     return (build_version, extra_info)
 def check_workflow(self, workflow_name, workflow_id):
     """Check the most recent completed execution of a single workflow.

     :param workflow_name: workflow name (used if given)
     :param workflow_id: workflow id (used if given)
     :return: status string of the latest complete report, or None when in
              --all mode and the workflow has no results
     Sets warning state on INCOMPLETE, critical on any other non-SUCCESS
     status; quits CRITICAL when no results/reports are found, UNKNOWN on
     parse errors.
     """
     log.info("checking workflow '%s' id '%s'", workflow_name, workflow_id)
     # GET /workflow/fetchWorkflowStatus/<instance_id> is also available but only uses wfId, doesn't support wfName
     # returns ['result']['list'] = [ {}, {}, ... ]
     (req, self.query_time) = self.req(
         url='{url_base}/workflow/publish/getWorkflowExecutionHistory'.
         format(url_base=self.url_base),
         # orders by newest first, but seems to return last 10 anyway
         body=json.dumps({
             'chunk_size': 1,
             'currentPage': 1,
             'wfName': workflow_name,
             'wfId': workflow_id
         }))
     # build a human-readable identifier for error messages
     info = ''
     if workflow_name:
         info += " name '{0}'".format(workflow_name)
     if workflow_id:
         info += " id '{0}'".format(workflow_id)
     try:
         json_dict = json.loads(req.content)
         result = json_dict['result']
         not_found_err = '{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                         'Perhaps you specified the wrong name/id or the workflow hasn\'t run yet? ' + \
                         'Use --list to see existing workflows'
         if result is None:
             # in --all mode a workflow with no results is tolerated and skipped
             if self._all:
                 return None
             qquit('CRITICAL',
                   "no results found for workflow{0}".format(not_found_err))
         reports = result['jobExecutionReports']
         if not isList(reports):
             raise ValueError('jobExecutionReports is not a list')
         if not reports:
             qquit('CRITICAL',
                   "no reports found for workflow{0}".format(not_found_err))
         # orders by newest first by default, checking last run only
         report = self.get_latest_complete_report(reports)
         status = report['status']
         if status == 'SUCCESS':
             pass
         elif status == 'INCOMPLETE':
             self.warning()
         else:
             self.critical()
         self.msg += "workflow '{workflow}' id '{id}' status = '{status}'".format(
             workflow=report['wfName'], id=report['wfId'], status=status)
         if not self._all:
             self.check_times(report['startDate'], report['endDate'])
         return status
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN',
               'error parsing workflow execution history: {0}'.format(_))
Exemplo n.º 40
0
    def run(self):
        """Check whether an HBase table is enabled via the Thrift API.

        With --list, prints the available tables and exits UNKNOWN. Goes
        CRITICAL when the table is missing or disabled, or on connection
        failure.
        """
        self.no_args()
        self.host = self.get_opt('host')
        self.port = self.get_opt('port')
        self.table = self.get_opt('table')
        validate_host(self.host)
        validate_port(self.port)
        validate_database_tablename(self.table, 'hbase')
        try:
            log.info('connecting to HBase Thrift Server at %s:%s', self.host,
                     self.port)
            # cast port to int to avoid low level socket module TypeError for ports > 32000
            self.conn = happybase.Connection(host=self.host,
                                             port=int(self.port),
                                             timeout=10 * 1000)  # ms
        except (socket.error, socket.timeout, ThriftException,
                HBaseIOError) as _:
            qquit('CRITICAL', 'error connecting: {0}'.format(_))
        if self.get_opt('list'):
            tables = self.get_tables()
            print('HBase Tables:\n\n' + '\n'.join(tables))
            sys.exit(ERRORS['UNKNOWN'])
        log.info('checking table \'%s\'', self.table)
        is_enabled = None
        try:
            is_enabled = self.conn.is_table_enabled(self.table)
        except HBaseIOError as _:
            #if 'org.apache.hadoop.hbase.TableNotFoundException' in _.message:
            # NOTE(review): exception .message is a Python 2 idiom — it raises
            # AttributeError on Python 3; confirm the target interpreter
            if 'TableNotFoundException' in _.message:
                qquit('CRITICAL',
                      'table \'{0}\' does not exist'.format(self.table))
            else:
                qquit('CRITICAL', _)
        except (socket.error, socket.timeout, ThriftException) as _:
            qquit('CRITICAL', _)

        if not is_enabled:
            self.critical()
        self.msg = 'HBase table \'{0}\' enabled = {1}'.format(
            self.table, is_enabled)
        log.info('finished, closing connection')
        self.conn.close()
Exemplo n.º 41
0
    def run(self):
        """Check the number of running Tachyon workers via the Master UI.

        Scrapes the /home page for the 'Running Workers' field, reports the
        count and goes CRITICAL when no workers are running; UNKNOWN when
        the page cannot be parsed.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)

        log.info('querying Tachyon Master')
        url = 'http://%(host)s:%(port)s/home' % {'host': host, 'port': port}
        log.debug('GET %s' % url)
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s" % (response.status_code, response.reason))
        log.debug("content:\n{0}\n{1}\n{2}".format('=' * 80, response.content.strip(), '=' * 80))
        if response.status_code != 200:
            qquit('CRITICAL', "Non-200 response! %s %s" % (response.status_code, response.reason))
        soup = BeautifulSoup(response.content, 'html.parser')
        try:
            # find the 'Running Workers' header cell and read its neighbour
            header_cell = soup.find('th', text=re.compile(r'Running\s+Workers:?', re.I))
            running_workers = header_cell.find_next_sibling().get_text()
        except (AttributeError, TypeError):
            qquit('UNKNOWN',
                  'failed to find parse Tachyon Master info for running workers' % self.__dict__)
        try:
            running_workers = int(running_workers)
        except (ValueError, TypeError):
            qquit('UNKNOWN',
                  'Tachyon Master live workers parsing returned non-integer: {0}'.format(running_workers))
        self.msg = 'Tachyon running workers = {0}'.format(running_workers)  # pylint: disable=attribute-defined-outside-init
        self.ok()
        # TODO: thresholds on number of live workers (coming soon)
        if running_workers < 1:
            self.critical()
Exemplo n.º 42
0
 def run(self):
     """Count Blue Talon policy resources across all resource domains.

     Queries the PolicyManagement REST API, applies thresholds to the total
     resource count and emits perfdata. Quits CRITICAL on HTTP errors,
     UNKNOWN on unexpected response structure.
     """
     log.info('querying %s', self.software)
     url = '{protocol}://{host}:{port}/PolicyManagement/{api_version}/resources'\
           .format(host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol)
     log.debug('GET %s', url)
     try:
         req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
     except requests.exceptions.RequestException as _:
         # give a friendlier hint for the common SSL/plaintext port mix-ups
         errhint = ''
         # NOTE(review): _.message is a Python 2 idiom; on Python 3 this would
         # raise AttributeError inside the handler — confirm target interpreter
         if 'BadStatusLine' in str(_.message):
             errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
         elif self.protocol == 'https' and 'unknown protocol' in str(_.message):
             errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
         qquit('CRITICAL', str(_) + errhint)
     log.debug("response: %s %s", req.status_code, req.reason)
     log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
     if req.status_code != 200:
         qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
     try:
         json_dict = json.loads(req.content)
         if log.isEnabledFor(logging.DEBUG):
             print(jsonpp(json_dict))
             print('='*80)
         if not isDict(json_dict):
             raise ValueError("non-dict returned by Blue Talon API (got type '{0}')".format(type(json_dict)))
         resource_domains_list = json_dict['resource_domains']
         if not isList(resource_domains_list):
             raise ValueError("non-list returned for 'resource_domains' key by Blue Talon API (got type '{0}')"\
                              .format(type(resource_domains_list)))
         num_resource_domains = len(resource_domains_list)
         num_resources = 0
         # sum resources over every domain in the response
         for resource_domain in resource_domains_list:
             resources = resource_domain['resources']
             if not isList(resources):
                 raise ValueError("non-list found for resources in resource_domain '{0}' (got type '{1}'"\
                                  .format(resource_domain['resource_domain_name'], type(resources)))
             num_resources += len(resources)
         self.msg += '{num_resources} resources'.format(num_resources=num_resources)
         self.check_thresholds(num_resources)
         self.msg += ' across {num_resource_domains} resource domains'\
                     .format(num_resource_domains=num_resource_domains)
         self.msg += ' | num_resources={num_resources}{perf} num_resource_domains={num_resource_domains}'\
                     .format(num_resources=num_resources,
                             num_resource_domains=num_resource_domains,
                             perf=self.get_perf_thresholds())
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
                          .format(software=self.software,
                                  exception=type(_).__name__,
                                  error=_,
                                  support_msg=support_msg_api()))
Exemplo n.º 43
0
    def run(self):
        """Check HBase region balance across RegionServers via Thrift.

        Computes the % imbalance between the servers hosting the most and
        fewest regions (optionally for a single --table), applies thresholds
        and emits min/max perfdata.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        self.table = self.get_opt('table')
        validate_host(host)
        validate_port(port)
        self.validate_thresholds(integer=False)

        try:
            log.info('connecting to HBase Thrift Server at %s:%s', host, port)
            # cast port to int to avoid low level socket module TypeError for ports > 32000
            self.conn = happybase.Connection(host=host,
                                             port=int(port),
                                             timeout=10 * 1000)  # ms
        except (socket.error, socket.timeout, ThriftException,
                HBaseIOError) as _:
            qquit('CRITICAL', 'error connecting: {0}'.format(_))
        tables = self.conn.tables()
        if len(tables) < 1:
            qquit('CRITICAL', 'no HBase tables found!')
        if self.get_opt('list_tables'):
            print('Tables:\n\n' + '\n'.join(tables))
            sys.exit(ERRORS['UNKNOWN'])
        if self.table:
            if self.table not in tables:
                qquit('CRITICAL',
                      "HBase table '{0}' does not exist!".format(self.table))
            self.process_table(self.table)
        else:
            # no table specified: tally regions across every table
            for table in tables:
                self.process_table(table)
        log.info('finished, closing connection')
        self.conn.close()

        imbalance = self.calculate_imbalance()

        self.msg = '{0}% region imbalance'.format(imbalance)
        self.check_thresholds(imbalance)
        self.msg += ' between HBase RegionServers hosting the most vs least number of regions'
        if self.table:
            self.msg += " for table '{0}'".format(self.table)
        else:
            self.msg += ' across all tables'
        self.msg += ' (min = {0}, max = {1})'.format(
            self.server_min_regions[1], self.server_max_regions[1])
        self.msg += " | '% region imbalance'={0}%".format(imbalance)
        self.msg += self.get_perf_thresholds()
        self.msg += ' min_regions={0} max_regions={1}'.format(
            self.server_min_regions[1], self.server_max_regions[1])
Exemplo n.º 44
0
 def run(self):
     """Check the number of Blue Talon policy engine end points.

     Parses the API response, cross-checks the endpoint count against the
     count embedded in the response message, applies thresholds and emits
     perfdata. Quits CRITICAL on request failures, UNKNOWN on parse errors.
     """
     content = self.get()
     try:
         json_dict = json.loads(content)
         if log.isEnabledFor(logging.DEBUG):
             print(jsonpp(json_dict))
             print('='*80)
         if not isDict(json_dict):
             raise ValueError('returned content is not a dict')
         status = json_dict['status']
         if status != 'success':
             qquit('CRITICAL', "request status = '{0}' (expected 'success')".format(status))
         status_code = json_dict['statusCode']
         if status_code != 200:
             qquit('CRITICAL', "request status code = '{0}' (expected '200')".format(status_code))
         message = json_dict['message']
         data = json_dict['data']
         if not data:
             num_endpoints = 0
         elif not isList(data):
             qquit('CRITICAL', 'non-list returned for policy end points data')
         else:
             num_endpoints = len(data)
         # bugfix: re.match() takes (pattern, string) — the arguments were
         # previously swapped, and '[(\d+)]' was a character class rather than
         # a capture group, so this cross-check could never succeed; brackets
         # are made optional to tolerate either message format
         match = re.match(r'Total \[?(\d+)\]? policy engine end point\(s\) found', message, re.I)
         if not match:
             raise ValueError('failed to parse message for confirmation of number of endpoints')
         message_num_endpoints = int(match.group(1))
         if num_endpoints != message_num_endpoints:
             raise ValueError('num endpoints does not match parsed value from returned message')
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
                          .format(software=self.software,
                                  exception=type(_).__name__,
                                  error=_,
                                  support_msg=support_msg_api()))
     self.msg = "{software} number of policy end points = {num_endpoints}"\
                .format(software=self.software, num_endpoints=num_endpoints)
     self.check_thresholds(num_endpoints)
     self.msg += ' | num_endpoints={num_endpoints}'.format(num_endpoints=num_endpoints) + self.get_perf_thresholds()
Exemplo n.º 45
0
 def main(self):
     """Top-level CLI driver: parse options, configure logging and timeout, run.

     Invokes self.run() and maps CriticalError/WarningError/UnknownError to
     the corresponding exit via qquit(); InvalidOptionException prints usage
     instead.
     """
     # DEBUG env var is picked up immediately in pylib utils, do not override it here if so
     if os.getenv('DEBUG'):
         log.setLevel(logging.DEBUG)
     if not log.isEnabledFor(logging.DEBUG) and \
        not log.isEnabledFor(logging.ERROR): # do not downgrade logging either
         log.setLevel(logging.WARN)
     self.setup()
     try:
         self.add_options()
         self.add_default_opts()
     except InvalidOptionException as _:
         self.usage(_)
     try:
         self.__parse_args__()
         # broken
         # autoflush()
         # too late
         # os.environ['PYTHONUNBUFFERED'] = "anything"
         log.info('Hari Sekhon %s', self.version)
         log.info(self._github_repo)
         log.info('verbose level: %s (%s)', self.verbose,
                  logging.getLevelName(log.getEffectiveLevel()))
         if self.timeout is not None:
             # arm a SIGALRM watchdog so a hung run() cannot block forever
             validate_int(self.timeout, 'timeout', 0, self.timeout_max)
             log.debug('setting timeout alarm (%s)', self.timeout)
             signal.signal(signal.SIGALRM, self.timeout_handler)
             signal.alarm(int(self.timeout))
         # if self.options.version:
         #     print(self.version)
         #     sys.exit(ERRORS['UNKNOWN'])
         self.process_options()
         self.process_args()
         try:
             self.run()
         except CriticalError as _:
             qquit('CRITICAL', _)
         except WarningError as _:
             qquit('WARNING', _)
         except UnknownError as _:
             qquit('UNKNOWN', _)
         self.__end__()
     except InvalidOptionException as _:
         if log.isEnabledFor(logging.DEBUG):
             log.debug(traceback.format_exc())
         self.usage(_)  # pragma: no cover
     except KeyboardInterrupt:
         # log.debug('Caught control-c...')
         print('Caught control-c...')  # pragma: no cover
 def process_table(self, table):
     """Tally per-RegionServer region counts for one HBase table.

     Increments self.server_region_counts[server] for every region of the
     table. Quits UNKNOWN when the table has no regions or region info is
     missing expected keys, CRITICAL on connection errors.
     """
     try:
         regions = self.conn.table(table).regions()
         if not regions:
             qquit('UNKNOWN', "no regions found for table '{0}'".format(table))
         counts = self.server_region_counts
         for region in regions:
             log.debug("table '%s' region '%s'", table, region)
             hosting_server = region['server_name']
             counts[hosting_server] = counts.get(hosting_server, 0) + 1
     except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
         qquit('CRITICAL', _)
     except KeyError:
         qquit('UNKNOWN', 'failed to process region information. ' + support_msg_api())
Exemplo n.º 47
0
    def run(self):
        """Check Oozie server status via the v1 admin REST API.

        Queries /oozie/v1/admin/status, expects systemMode == 'NORMAL',
        and sets plugin state to OK or CRITICAL accordingly. Exits via
        qquit() on connection, HTTP or parse errors.
        """
        self.no_args()
        host = self.options.host
        port = self.options.port
        validate_host(host)
        validate_port(port)

        url = 'http://%(host)s:%(port)s/oozie/v1/admin/status' % locals()
        # lazy %-style args so the message is only built when DEBUG is enabled
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content: '%s'", req.content)
        if req.status_code != 200:
            qquit('CRITICAL',
                  "Non-200 response! %s %s" % (req.status_code, req.reason))
        # should == NORMAL
        if not isJson(req.content):
            qquit(
                'UNKNOWN',
                'non-JSON returned by Oozie server at {0}:{1}'.format(
                    host, port))
        status = None
        try:
            # avoid reusing '_' (conventionally the caught-exception name)
            json_dict = json.loads(req.content)
            status = json_dict['systemMode']
        except KeyError:
            qquit(
                'UNKNOWN',
                'systemMode key was not returned in output from Oozie. {0}'.
                format(support_msg_api()))
        self.msg = 'Oozie status = {0}'.format(status)
        if status == 'NORMAL':
            self.ok()
        else:
            self.critical()
Exemplo n.º 48
0
 def run(self):
     """Connect to HBase over Thrift, optionally list tables, run the
     table check and emit timing output.

     Exits via qquit() (or sys.exit for --list) on errors.
     """
     initial_start = time.time()
     try:
         connect_time = self.connect()
         if self.list_tables:
             tables = self.get_tables()
             print('HBase Tables:\n\n' + '\n'.join(tables))
             sys.exit(ERRORS['UNKNOWN'])
         self.check_table()
         log.info('finished, closing connection')
         self.conn.close()
     except HBaseIOError as _:
         # BaseException.message was removed in Python 3 - match on str(_)
         #if 'org.apache.hadoop.hbase.TableNotFoundException' in str(_):
         if 'TableNotFoundException' in str(_):
             qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
         elif 'NoSuchColumnFamilyException' in str(_):
             qquit('CRITICAL', 'column family \'{0}\' does not exist'.format(self.column))
         else:
             qquit('CRITICAL', _)
     except (socket.timeout, ThriftException) as _:
         qquit('CRITICAL', _)
     total_time = (time.time() - initial_start) * 1000
     self.output(connect_time, total_time)
 def parse(json_data):
     """Extract compactionQueueLength from RegionServer JMX bean data.

     Walks json_data['beans'] for the RegionServer Server sub-bean and
     returns its integer compaction queue size. Exits UNKNOWN if the
     mbean is missing, the value is non-integer, or parsing fails.
     """
     try:
         compaction_queue_size = None
         for bean in json_data['beans']:
             if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server':
                 if log.isEnabledFor(logging.DEBUG):
                     log.debug('found RegionServer section:')
                     log.debug('%s', jsonpp(bean))
                 compaction_queue_size = bean['compactionQueueLength']
                 if not isInt(compaction_queue_size):
                     qquit('UNKNOWN', 'non-integer returned for compactionQueueLength! ' + support_msg_api())
                 return compaction_queue_size
     except KeyError as _:
         # str-concatenating the KeyError object raised TypeError inside the
         # handler, masking the real error - stringify via format() instead
         qquit('UNKNOWN', '{0}: failed to parse HBase Master jmx info. '.format(_) + support_msg_api())
     qquit('UNKNOWN', 'RegionServer mbean not found, double check this is pointing to an HBase RegionServer')
Exemplo n.º 50
0
    def run(self):
        """Scrape the HMaster /master-status UI and alert on the longest
        current region-in-transition time against the configured
        warning/critical thresholds.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)
        self.validate_thresholds()

        # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for
        # ritOldestAge, despite currently having regions stuck in transition for a large number of ms
        # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
        # https://issues.apache.org/jira/browse/HBASE-16636
        # the flat txt /dump debug page also lacks the summary count,
        # so scrape the master-status UI page instead
        url = 'http://{0}:{1}/master-status'.format(host, port)
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(),
                  '=' * 80)
        if req.status_code != 200:
            qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
        longest_rit_time = self.parse(req.content)
        # guard clauses instead of if/elif/else ladder
        if longest_rit_time is None:
            self.msg = 'no regions in transition'
            return
        if not isInt(longest_rit_time):
            qquit(
                'UNKNOWN', 'parse error - got non-integer \'{0}\' for '.format(
                    longest_rit_time) +
                'longest regions in transition time when parsing HMaster UI')
        longest_rit_time /= 1000.0
        self.msg = 'HBase region longest current transition = {0:.2f} secs'.format(
            longest_rit_time)
        self.check_thresholds(longest_rit_time)
        self.msg += ' | longest_region_in_transition={0}'.format(
            longest_rit_time)
        self.msg += self.get_perf_thresholds()
Exemplo n.º 51
0
 def get(self):
     """GET the PDP end_points configuration from the Policy Management API.

     Returns the raw response body. Exits CRITICAL on connection errors
     (with SSL/plain-HTTP mismatch hints), 404 or any non-200 response.
     """
     log.info('querying %s', self.software)
     url = '{protocol}://{host}:{port}/PolicyManagement/{api_version}/configurations/pdp/end_points'\
           .format(host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol)
     log.debug('GET %s', url)
     try:
         req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
     except requests.exceptions.RequestException as _:
         errhint = ''
         # exceptions have no .message attribute in Python 3 - use str(_)
         if 'BadStatusLine' in str(_):
             errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
         elif self.protocol == 'https' and 'unknown protocol' in str(_):
             errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
         qquit('CRITICAL', str(_) + errhint)
     log.debug("response: %s %s", req.status_code, req.reason)
     log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
     if req.status_code == 404 and req.reason == 'Not Found':
         qquit('CRITICAL', '{0}: {1} (no end points?)'.format(req.status_code, req.reason))
     if req.status_code != 200:
         qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
     return req.content
Exemplo n.º 52
0
 def get_version(self):
     """Scrape the software version from the /home page.

     Parses the HTML for a <span class="version"> element and returns
     its text. Exits via qquit() on HTTP or parse errors.
     """
     log.info('querying %s', self.software)
     url = 'http://{host}:{port}/home'.format(host=self.host, port=self.port)
     log.debug('GET %s', url)
     try:
         req = requests.get(url)
     except requests.exceptions.RequestException as _:
         qquit('CRITICAL', _)
     log.debug("response: %s %s", req.status_code, req.reason)
     log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
     if req.status_code != 200:
         qquit('CRITICAL', "{0} {1}".format(req.status_code, req.reason))
     soup = BeautifulSoup(req.content, 'html.parser')
     if log.isEnabledFor(logging.DEBUG):
         log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '='*80))
     try:
         #version = soup.find('span', {'class': 'version'}).text
         version = soup.find('span', class_='version').text
     # dropped unused 'as _' capture; fixed 'find parse' typo in message
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse {0} output. {1}\n{2}'\
                          .format(self.software, support_msg_api(), traceback.format_exc()))
     return version
Exemplo n.º 53
0
 def get_version(self):
     """Query etcd's /version endpoint.

     Returns a (server_version, cluster_version) tuple parsed from the
     JSON response. Exits via qquit() on connection, HTTP, JSON or
     missing-key errors.
     """
     log.info('querying %s', self.software)
     url = 'http://{host}:{port}/version'.format(host=self.host,
                                                 port=self.port)
     log.debug('GET %s', url)
     try:
         req = requests.get(url)
     except requests.exceptions.RequestException as _:
         qquit('CRITICAL', _)
     log.debug("response: %s %s", req.status_code, req.reason)
     log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(),
               '=' * 80)
     if req.status_code != 200:
         qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
     try:
         json_dict = json.loads(req.content)
         version = json_dict['etcdserver']
         cluster_version = json_dict['etcdcluster']
     # also catch ValueError - a non-JSON body would otherwise raise
     # an uncaught exception out of json.loads (no isJson pre-check here)
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing output from {software}: {error}. {support_msg}'\
                          .format(software=self.software, error=_, support_msg=support_msg_api()))
     return (version, cluster_version)
 def check_workflow(self, workflow_name, workflow_id, max_age=None, max_runtime=None):
     """Check the most recent execution of the given workflow.

     Queries the workflow execution history endpoint (which returns
     newest first) and inspects only the latest report: sets OK if its
     status is SUCCESS, CRITICAL otherwise, then validates its start and
     end dates against max_age / max_runtime via check_times(). Exits
     via qquit() when no results are found or parsing fails.
     """
     log.info("checking workflow '%s' id '%s'", workflow_name, workflow_id)
     (req, self.query_time) = self.req(url='{url_base}/workflow/publish/getWorkflowExecutionHistory'
                                       .format(url_base=self.url_base),
                                       # orders by newest first, but seems to return last 10 anyway
                                       body=json.dumps({'chunk_size': 1,
                                                        'currentPage': 1,
                                                        'wfName': workflow_name,
                                                        'wfId': workflow_id}))
     # build a human-readable identifier for use in not-found messages
     info = ''
     if workflow_name:
         info += " name '{0}'".format(workflow_name)
     if workflow_id:
         info += " id '{0}'".format(workflow_id)
     try:
         json_dict = json.loads(req.content)
         result = json_dict['result']
         not_found_err = '{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                         'Perhaps you specified the wrong name/id? Use --list to see existing workflows'
         if result is None:
             qquit('CRITICAL', "no results found for workflow{0}".format(not_found_err))
         reports = result['jobExecutionReports']
         # raising ValueError here funnels into the same handler as KeyError below
         if not isList(reports):
             raise ValueError('jobExecutionReports is not a list')
         if not reports:
             qquit('CRITICAL', "no reports found for workflow{0}".format(not_found_err))
         # orders by newest first by default, checking last run only
         report = reports[0]
         status = report['status']
         if status == 'SUCCESS':
             self.ok()
         else:
             self.critical()
         self.msg += "workflow '{workflow}' id '{id}' status = '{status}'".format(workflow=report['wfName'],
                                                                                  id=report['wfId'],
                                                                                  status=status)
         self.check_times(report['startDate'], report['endDate'], max_age, max_runtime)
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))
Exemplo n.º 55
0
 def get_version(self):
     """Scrape the Solr version from the admin system info page.

     Parses the XML/HTML response for the <str name="solr-spec-version">
     element and returns its text. Exits via qquit() on HTTP or parse
     errors.
     """
     url = 'http://{host}:{port}/solr/admin/info/system'.format(
         host=self.host, port=self.port)
     log.debug('GET %s', url)
     try:
         req = requests.get(url)
     except requests.exceptions.RequestException as _:
         qquit('CRITICAL', _)
     log.debug('response: %s %s', req.status_code, req.reason)
     log.debug('content:\n%s\n%s\n%s', '=' * 80, req.content.strip(),
               '=' * 80)
     if req.status_code != 200:
         qquit('CRITICAL', '%s %s' % (req.status_code, req.reason))
     soup = BeautifulSoup(req.content, 'html.parser')
     if log.isEnabledFor(logging.DEBUG):
         log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(
             soup.prettify(), '=' * 80))
     try:
         version = soup.find('str', {'name': 'solr-spec-version'}).text
     # dropped unused 'as _' capture; fixed 'find parse' typo in message
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse Solr output. {0}\n{1}'\
                          .format(support_msg_api(), traceback.format_exc()))
     return version
    def check_table_regions(self):
        """Check region count and unassigned regions for self.table.

        Compares the number of regions against thresholds, warns if any
        region has no assigned server, and appends perfdata to self.msg.
        Exits via qquit() on connection or parse errors; closes the
        connection on success.
        """
        log.info('checking regions for table \'%s\'', self.table)
        regions = None
        try:
            table = self.conn.table(self.table)
            regions = table.regions()
        except HBaseIOError as _:
            # BaseException.message was removed in Python 3 - match on str(_)
            #if 'org.apache.hadoop.hbase.TableNotFoundException' in str(_):
            if 'TableNotFoundException' in str(_):
                qquit('CRITICAL',
                      'table \'{0}\' does not exist'.format(self.table))
            else:
                qquit('CRITICAL', _)
        except (socket.error, socket.timeout, ThriftException) as _:
            qquit('CRITICAL', _)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('%s', jsonpp(regions))
        if not regions:
            qquit('CRITICAL',
                  'failed to get regions for table \'{0}\''.format(self.table))
        if not isList(regions):
            qquit('UNKNOWN',
                  'region info returned is not a list! ' + support_msg_api())
        num_regions = len(regions)
        log.info('num regions: %s', num_regions)

        self.msg = 'HBase table \'{0}\' has {1} region{2}'.format(
            self.table, num_regions, plural(num_regions))
        self.check_thresholds(num_regions)

        num_unassigned_regions = 0
        for region in regions:
            try:
                # an empty server_name means the region is unassigned
                if not region['server_name']:
                    num_unassigned_regions += 1
            # dropped unused 'as _' capture
            except KeyError:
                qquit(
                    'UNKNOWN', 'failed to find server assigned to region. ' +
                    support_msg_api())
        log.info('num unassigned regions: %s', num_unassigned_regions)
        self.msg += ', {0} unassigned region{1}'.format(
            num_unassigned_regions, plural(num_unassigned_regions))
        if num_unassigned_regions > 0:
            self.warning()
            self.msg += '!'

        self.msg += ' |'
        self.msg += ' num_regions={0}'.format(
            num_regions) + self.get_perf_thresholds(boundary='lower')
        # hard-coded warn=1 crit=0 perfdata thresholds for unassigned regions
        self.msg += ' num_unassigned_regions={0};1;0'.format(
            num_unassigned_regions)
        log.info('finished, closing connection')
        self.conn.close()
 def parse_is_table_compacting(content):
     """Parse HBase table UI HTML and report whether the table is compacting.

     Locates the 'Table Attributes' section, finds the 'Compaction' row
     in the first following table, and returns True unless the state is
     NONE (enabled) or Unknown (disabled). Exits via qquit() on any
     parse failure.
     """
     soup = BeautifulSoup(content, 'html.parser')
     if log.isEnabledFor(logging.DEBUG):
         log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(
             soup.prettify(), '=' * 80))
     try:
         headings = soup.findAll('h2')
         for heading in headings:
             log.debug("checking heading '%s'", heading)
             if heading.get_text() == 'Table Attributes':
                 log.debug('found Table Attributes section header')
                 table = heading.find_next('table')
                 log.debug('checking first following table')
                 if log.isEnabledFor(logging.DEBUG):
                     log.debug('table:\n%s\n%s', table.prettify(), '=' * 80)
                 rows = table.findChildren('tr')
                 if len(rows) < 3:
                     qquit(
                         'UNKNOWN',
                         'parse error - less than the 3 expected rows in table attributes'
                     )
                 col_names = rows[0].findChildren('th')
                 if len(col_names) < 3:
                     qquit(
                         'UNKNOWN',
                         'parse error - less than the 3 expected column headings'
                     )
                 first_col = col_names[0].get_text().strip()
                 if first_col != 'Attribute Name':
                     # bug fix: the original message always printed an empty ''
                     # for the actual header - interpolate first_col so the
                     # error shows what was really found
                     qquit('UNKNOWN',
                           'parse error - expected first column header to be \'{0}\' but got \'{1}\' instead. '\
                           .format('Attribute Name', first_col)
                           + support_msg())
                 for row in rows[1:]:
                     cols = row.findChildren('td')
                     if len(cols) < 3:
                         qquit(
                             'UNKNOWN',
                             'parse error - less than the 3 expected columns in table attributes. '
                             + support_msg())
                     if cols[0].get_text().strip() == 'Compaction':
                         compaction_state = cols[1].get_text().strip()
                         # NONE when enabled, Unknown when disabled
                         if compaction_state in ('NONE', 'Unknown'):
                             return False
                         return True
         qquit(
             'UNKNOWN',
             'parse error - failed to find Table Attributes section in JSP. '
             + support_msg())
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse output. ' + support_msg())
Exemplo n.º 58
0
 def sanity_check(condition, msg):
     """Exit UNKNOWN unless the given condition holds.

     Used to validate assumptions while parsing the HBase attribute
     table header; 'msg' describes the failed expectation.
     """
     if condition:
         return
     qquit(
         'UNKNOWN', 'HBase attribute table header ' + msg +
         ', failed sanity check! ' + support_msg())
Exemplo n.º 59
0
 def get_table_conn(self):
     """Return a connection handle for self.table.

     Exits CRITICAL if the table exists but is not enabled.
     """
     log.info('checking table \'%s\'', self.table)
     if self.conn.is_table_enabled(self.table):
         return self.conn.table(self.table)
     qquit('CRITICAL', "table '{0}' is not enabled!".format(self.table))
Exemplo n.º 60
0
    def run(self):
        """Check that a given Mesos slave is registered with the Mesos Master.

        Queries /master/slaves, maps each slave's hostname to the host
        embedded in its pid, and matches --slave case-insensitively
        against either. With --list-slaves, prints the slave map and
        exits UNKNOWN instead.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        slave = self.get_opt('slave')
        list_slaves = self.get_opt('list_slaves')
        validate_host(host)
        validate_port(port)
        if not list_slaves:
            validate_host(slave, 'slave')

        url = 'http://%(host)s:%(port)s/master/slaves' % locals()
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        # lazy %-style args instead of eager .format()
        log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
        if req.status_code != 200:
            if req.status_code == 404:
                qquit('CRITICAL', '%s %s (did you point this at the correct Mesos Master?)'
                                  % (req.status_code, req.reason))
            qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
        content = req.content
        if not isJson(content):
            qquit('UNKNOWN', 'invalid JSON returned by Mesos Master')
        data = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            log.debug('\n%s', jsonpp(data))
        slaves = {}
        # extract the hostname from pids of the form 'slave(N)@host:port'
        regex = re.compile(r'^slave\(\d+\)\@(.+):\d+')
        try:
            for item in data['slaves']:
                match = regex.match(item['pid'])
                if match:
                    slaves[item['hostname']] = match.group(1)
                else:
                    slaves[item['hostname']] = item['pid']
        except KeyError:
            # bug fix: support_msg_api was referenced without calling it,
            # formatting the function repr into the message
            qquit('UNKNOWN', 'failed to parse slaves from Mesos API output. {0}'.format(support_msg_api()))
        if list_slaves:
            qquit('UNKNOWN', 'Slaves list:\n\n{0}'.format(dict_lines(slaves)))
        log.info('found slaves:\n\n{0}\n'.format(dict_lines(slaves)))
        slave = slave.lower()
        for _ in slaves:
            if slave == _.lower() or slave == slaves[_].lower():
                qquit('OK', "Mesos slave '{0}' registered with master".format(slave))
                break
        else:
            qquit('CRITICAL', "Mesos slave '{0}' not registered with master".format(slave))