def get_uncleanleaderelections(self, host, broker_id):
    ''' Get UncleanLeaderElectionsPerSec '''
    unclean_count = None
    unclean_rate = None
    for jmx_data in ["RateUnit", "OneMinuteRate", "EventType", "Count",
                     "FifteenMinuteRate", "FiveMinuteRate", "MeanRate"]:
        url_jmxproxy = ("http://127.0.0.1:8000/jmxproxy/%s/"
                        "kafka.controller:type=ControllerStats,"
                        "name=UncleanLeaderElectionsPerSec/%s") % (host, jmx_data)
        response = requests.get(url_jmxproxy)
        if response.status_code == 200:
            LOGGER.debug("Getting %s for %s", response.text, url_jmxproxy)
            self.results.append(
                Event(TIMESTAMP_MILLIS(), 'kafka',
                      ('kafka.brokers.%d.'
                       'controllerstats.UncleanLeaderElections.%s') %
                      (broker_id, jmx_data), [], response.text))
            if jmx_data == "Count":
                unclean_count = int(response.text)
            elif jmx_data == "FifteenMinuteRate":
                unclean_rate = Decimal(response.text)
        else:
            LOGGER.error("ERROR for url_jmxproxy: %s", url_jmxproxy)
    if unclean_count is not None and unclean_rate is not None:
        if unclean_rate > 0.0002:
            LOGGER.debug("broker %d threshold is %f and current rate is %f",
                         broker_id, 0.0002, unclean_rate)
            self.whitebox_error_code = 104
    return None
def get_underreplicatedpartitions(self, host, broker_id):
    ''' Get UnderReplicatedPartitions '''
    url_jmxproxy = ("http://127.0.0.1:8000/jmxproxy/%s/"
                    "kafka.server:type=ReplicaManager,"
                    "name=UnderReplicatedPartitions/Value") % host
    response = requests.get(url_jmxproxy)
    if response.status_code == 200:
        LOGGER.debug("Getting %s for %s", response.text, url_jmxproxy)
        self.results.append(
            Event(TIMESTAMP_MILLIS(), 'kafka',
                  'kafka.brokers.%d.UnderReplicatedPartitions' % broker_id,
                  [], response.text))
        if response.text != "0":
            self.whitebox_error_code = 101
    else:
        LOGGER.error("ERROR for url_jmxproxy: %s", url_jmxproxy)
    return None
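# --- illustrative sketch (not part of the plugin) --------------------------
# Both collectors above follow the same pattern: fetch a single attribute of
# a Kafka MBean through the local jmxproxy and log an error on any non-200
# response. A minimal, hedged version of that pattern is sketched below; the
# proxy address and URL layout are copied from the code above, and
# fetch_jmx_attribute is a hypothetical helper name, not part of the plugin.
import requests

def fetch_jmx_attribute(host, bean, attribute, proxy="http://127.0.0.1:8000"):
    ''' Return the attribute text for bean on host, or None on failure. '''
    url = "%s/jmxproxy/%s/%s/%s" % (proxy, host, bean, attribute)
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None

# usage: value is the raw string the plugin compares against "0"
# value = fetch_jmx_attribute('broker-1',
#                             'kafka.server:type=ReplicaManager,'
#                             'name=UnderReplicatedPartitions', 'Value')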
def analyse_results(zk_data, zk_election):
    '''
    Analyse the partition summary and Prod2Cons,
    then set the test result flag accordingly.
    If the test flag is not green, add a reason explaining why,
    then return the result as an Event.
    '''
    analyse_status = MonitorStatus["green"]
    analyse_causes = []
    analyse_metric = 'zookeeper.health'
    if zk_data and len(zk_data.list_zk_ko) > 0:
        LOGGER.error("analyse_results : at least one zookeeper node failed")
        analyse_status = MonitorStatus["red"]
        analyse_causes.append("zookeeper node(s) unreachable (%s)" %
                              zk_data.list_zk_ko)
    elif zk_election is False:
        LOGGER.error(
            "analyse_results : zookeeper election not done, check nodes mode")
        analyse_status = MonitorStatus["red"]
        analyse_causes.append("zookeeper election not done, check nodes mode")
    return Event(TIMESTAMP_MILLIS(), 'zookeeper', analyse_metric,
                 analyse_causes, analyse_status)
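# --- illustrative sketch (not part of the plugin) --------------------------
# The Event tuple used throughout these plugins is assumed here to be a
# five-field namedtuple (timestamp, source, metric, causes, value); that
# shape matches every Event(...) construction in this file, but the real
# definition lives elsewhere in the repo. The green/amber/red string values
# in MonitorStatus are likewise an assumption inferred from how the health
# values are asserted elsewhere in this file.
from collections import namedtuple

Event = namedtuple('Event', ['timestamp', 'source', 'metric', 'causes', 'value'])
MonitorStatus = {"green": "OK", "amber": "WARN", "red": "ERROR"}

# a red zookeeper health event, as analyse_results would emit it:
example = Event(0, 'zookeeper', 'zookeeper.health',
                ["zookeeper node(s) unreachable (zk-2:2181)"],
                MonitorStatus["red"])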
def get_activecontrollercount(self, host, broker_id):
    ''' Get ActiveControllerCount '''
    url_jmxproxy = ("http://127.0.0.1:8000/jmxproxy/%s/"
                    "kafka.controller:type=KafkaController,"
                    "name=ActiveControllerCount/Value") % host
    response = requests.get(url_jmxproxy)
    if response.status_code == 200:
        LOGGER.debug("Getting %s for %s", response.text, url_jmxproxy)
        self.results.append(
            Event(TIMESTAMP_MILLIS(), 'kafka',
                  'kafka.brokers.%d.ActiveControllerCount' % broker_id,
                  [], response.text))
        # exactly one broker in the cluster should report a count of 1;
        # remember the first one seen and flag an error on any subsequent one
        if self.activecontrollercount == -1 and response.text == "1":
            self.activecontrollercount = 1
        elif self.activecontrollercount == 1 and response.text == "1":
            self.whitebox_error_code = 102
    else:
        LOGGER.error("ERROR for url_jmxproxy: %s", url_jmxproxy)
    return None
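# --- illustrative sketch (not part of the plugin) --------------------------
# The invariant the check above enforces: across all brokers, exactly one
# ActiveControllerCount value should be 1. A standalone, runnable version of
# that check (hypothetical helper name, same 102 error code as above):
def check_active_controllers(counts):
    ''' Return 102 if more than one broker claims to be controller, else None. '''
    active = [c for c in counts if c == 1]
    if len(active) > 1:
        return 102
    return None

assert check_active_controllers([1, 0, 0]) is None
assert check_active_controllers([1, 1, 0]) == 102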
def runner(self, args, display=True):
    values = []
    health_values = []
    plugin_args = args.split() \
        if args is not None and (len(args.strip()) > 0) \
        else ""
    options = self.read_args(plugin_args)
    if options.hadoopdistro == 'CDH':
        api = ApiResource(server_host=options.cmhost,
                          server_port=options.cmport,
                          username=options.cmuser,
                          password=options.cmpassword,
                          version=11)
        cluster = api.get_cluster(api.get_all_clusters()[0].name)
        cdh = CDHData(api, cluster)
    else:
        cdh = HDPData(options.cmhost, options.cmuser, options.cmpassword)
    hbase = None

    def run_test_sequence():
        # pylint: disable=too-many-return-statements
        hbase = happybase.Connection(host=cdh.get_hbase_endpoint())
        if abort_test_sequence is True:
            return
        reason = []
        try:
            start = TIMESTAMP_MILLIS()
            try:
                hbase.create_table('blackbox_test_table', {'cf': dict()})
                logging.debug("test table created")
            except AlreadyExists:
                logging.debug("test table exists")
            table = hbase.table('blackbox_test_table')
            end = TIMESTAMP_MILLIS()
            create_table_ok = True
            create_table_ms = end - start
            values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.create_table_time_ms", [],
                      create_table_ms))
        except:
            LOGGER.error(traceback.format_exc())
            create_table_ok = False
            reason = ['Create HBase table operation failed']
        health_values.append(
            Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                  "hadoop.HBASE.create_table_succeeded", reason,
                  create_table_ok))

        # write some data to it
        if abort_test_sequence is True:
            return
        reason = []
        try:
            start = TIMESTAMP_MILLIS()
            table.put('row_key', {'cf:column': 'value'})
            end = TIMESTAMP_MILLIS()
            write_hbase_ok = True
            write_hbase_ms = end - start
            values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.write_time_ms", [], write_hbase_ms))
        except:
            LOGGER.error(traceback.format_exc())
            write_hbase_ok = False
            reason = ['Failed to insert row in HBase table']
        health_values.append(
            Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                  "hadoop.HBASE.write_succeeded", reason, write_hbase_ok))

        # read some data from it
        if abort_test_sequence is True:
            return
        reason = []
        try:
            start = TIMESTAMP_MILLIS()
            row = table.row('row_key', columns=['cf:column'])
            end = TIMESTAMP_MILLIS()
            read_hbase_ms = end - start
            read_hbase_ok = row['cf:column'] == 'value'
            values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.read_time_ms", [], read_hbase_ms))
        except:
            LOGGER.error(traceback.format_exc())
            hbase_fix_output = subprocess.check_output([
                'sudo', '-u', 'hbase', 'hbase', 'hbck', '-repair',
                'blackbox_test_table'
            ])
            for line in hbase_fix_output.splitlines():
                if 'Status:' in line or 'inconsistencies detected' in line:
                    LOGGER.debug(line)
            subprocess.check_output([
                'sudo', '-u', 'hbase', 'hbase', 'zkcli', 'rmr',
                '/hbase/table/blackbox_test_table'
            ])
            subprocess.check_output([
                'sudo', '-u', 'hdfs', 'hadoop', 'fs', '-rm', '-r', '-f',
                '-skipTrash', '/hbase/data/default/blackbox_test_table'
            ])
            read_hbase_ok = False
            reason = ['Failed to fetch row by row key from HBase']
        health_values.append(
            Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                  "hadoop.HBASE.read_succeeded", reason, read_hbase_ok))

        # create some hive metadata
        reason = []
        if abort_test_sequence is True:
            return
        try:
            start = TIMESTAMP_MILLIS()
            hive = hive_api.connect(cdh.get_hive_endpoint())
            end = TIMESTAMP_MILLIS()
            hive.cursor().execute("DROP TABLE blackbox_test_table")
            connect_to_hive_ms = end - start
            connect_to_hive_ok = True
            values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.connection_time_ms", [],
                      connect_to_hive_ms))
        except:
            LOGGER.error(traceback.format_exc())
            connect_to_hive_ok = False
            reason = ['Failed to connect to Hive Metastore']
        health_values.append(
            Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                  "hadoop.HIVE.connection_succeeded", reason,
                  connect_to_hive_ok))

        if abort_test_sequence is True:
            return
        reason = []
        try:
            start = TIMESTAMP_MILLIS()
            hive.cursor().execute((
                "CREATE EXTERNAL TABLE "
                "blackbox_test_table (key STRING, value STRING) "
                "STORED BY \"org.apache.hadoop.hive.hbase.HBaseStorageHandler\" "
                "WITH SERDEPROPERTIES "
                "(\"hbase.columns.mapping\" = \":key,cf:column\") "
                "TBLPROPERTIES(\"hbase.table.name\" = \"blackbox_test_table\")"))
            end = TIMESTAMP_MILLIS()
            create_metadata_ms = end - start
            create_metadata_ok = True
            values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.create_metadata_time_ms", [],
                      create_metadata_ms))
        except:
            LOGGER.error(traceback.format_exc())
            create_metadata_ok = False
            reason = [
                'CREATE EXTERNAL TABLE statement failed on Hive Metastore'
            ]
        health_values.append(
            Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                  "hadoop.HIVE.create_metadata_succeeded", reason,
                  create_metadata_ok))

        # read some data via impala using it
        if abort_test_sequence is True:
            return
        if cdh.get_impala_endpoint() is not None:
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                impala = connect(host=cdh.get_impala_endpoint(),
                                 port=options.impalaport)
                end = TIMESTAMP_MILLIS()
                impala.cursor().execute("invalidate metadata")
                connect_to_impala_ms = end - start
                connect_to_impala_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.connection_time_ms", [],
                          connect_to_impala_ms))
            except:
                LOGGER.error(traceback.format_exc())
                connect_to_impala_ok = False
                reason = ['Failed to connect to Impala']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                      "hadoop.IMPALA.connection_succeeded", reason,
                      connect_to_impala_ok))
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                impala_cursor = impala.cursor()
                impala_cursor.execute("SELECT * FROM blackbox_test_table")
                table_contents = impala_cursor.fetchall()
                end = TIMESTAMP_MILLIS()
                read_impala_ms = end - start
                read_impala_ok = table_contents[0][1] == 'value'
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.read_time_ms", [], read_impala_ms))
            except:
                LOGGER.error(traceback.format_exc())
                read_impala_ok = False
                reason = ['Failed to SELECT from Impala']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                      "hadoop.IMPALA.read_succeeded", reason,
                      read_impala_ok))
        else:
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                hive_cursor = hive.cursor()
                hive_cursor.execute("SELECT * FROM blackbox_test_table")
                table_contents = hive_cursor.fetchall()
                end = TIMESTAMP_MILLIS()
                read_hive_ms = end - start
                read_hive_ok = table_contents[0][1] == 'value'
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                          "hadoop.HQUERY.read_time_ms", [], read_hive_ms))
            except:
                LOGGER.error(traceback.format_exc())
                read_hive_ok = False
                reason = ['Failed to SELECT from Hive']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                      "hadoop.HQUERY.read_succeeded", reason, read_hive_ok))

        # delete metadata
        if abort_test_sequence is True:
            return
        reason = []
        try:
            start = TIMESTAMP_MILLIS()
            hive.cursor().execute("DROP TABLE blackbox_test_table")
            end = TIMESTAMP_MILLIS()
            drop_metadata_ms = end - start
            drop_metadata_ok = True
            values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.drop_table_time_ms", [],
                      drop_metadata_ms))
        except:
            LOGGER.error(traceback.format_exc())
            drop_metadata_ok = False
            reason = ['Failed to DROP table in Hive Metastore']
        health_values.append(
            Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                  "hadoop.HIVE.drop_table_succeeded", reason,
                  drop_metadata_ok))

        # delete hbase table
        if abort_test_sequence is True:
            return
        reason = []
        try:
            start = TIMESTAMP_MILLIS()
            # Disabled deleting table to work around apparent hbase bug (see VPP-17)
            # but leaving test step in so it can be easily re-enabled for testing.
            #hbase.disable_table('blackbox_test_table')
            #hbase.delete_table('blackbox_test_table')
            end = TIMESTAMP_MILLIS()
            drop_table_ms = end - start
            drop_table_ok = True
            values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.drop_table_time_ms", [], drop_table_ms))
        except:
            LOGGER.error(traceback.format_exc())
            drop_table_ok = False
            reason = ['Failed to drop table in HBase']
        health_values.append(
            Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                  "hadoop.HBASE.drop_table_succeeded", reason,
                  drop_table_ok))

    def to_status(flag):
        ''' Convert True to OK and False to ERROR '''
        if flag in [True, False]:
            status = 'OK' if flag is True else 'ERROR'
        else:
            status = flag
        return status

    def default_health_value(name, service, operation, failed_step):
        result = False
        if len([event for event in health_values
                if event.metric == name]) == 0:
            if failed_step is not None:
                message = 'Did not attempt to %s due to timeout waiting for: %s' % (
                    operation, failed_step)
            else:
                message = 'Timed out waiting for %s to complete' % operation
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name(service), name,
                      [message], False))
            result = True
        return result

    test_thread = threading.Thread(target=run_test_sequence)
    test_thread.daemon = True
    abort_test_sequence = False
    test_thread.start()
    test_thread.join(60.0)
    abort_test_sequence = True
    if hbase is not None:
        hbase.close()

    failed_step = None
    if default_health_value("hadoop.HBASE.create_table_succeeded", "HBASE",
                            "create HBase table",
                            failed_step) and failed_step is None:
        failed_step = "create HBase table"
    if default_health_value("hadoop.HBASE.write_succeeded", "HBASE",
                            "write to HBase",
                            failed_step) and failed_step is None:
        failed_step = "write to HBase"
    if default_health_value("hadoop.HBASE.read_succeeded", "HBASE",
                            "read from HBase",
                            failed_step) and failed_step is None:
        failed_step = "read from HBase"
    if default_health_value("hadoop.HIVE.connection_succeeded", "HIVE",
                            "connect to Hive Metastore",
                            failed_step) and failed_step is None:
        failed_step = "connect to Hive Metastore"
    if default_health_value("hadoop.HIVE.create_metadata_succeeded", "HIVE",
                            "create Hive Metastore table",
                            failed_step) and failed_step is None:
        failed_step = "create Hive Metastore table"
    if cdh.get_impala_endpoint() is not None:
        if default_health_value("hadoop.IMPALA.connection_succeeded",
                                "IMPALA", "connect to Impala",
                                failed_step) and failed_step is None:
            failed_step = "connect to Impala"
        if default_health_value("hadoop.IMPALA.read_succeeded", "IMPALA",
                                "SELECT from Impala",
                                failed_step) and failed_step is None:
            failed_step = "SELECT from Impala"
    else:
        if default_health_value("hadoop.HQUERY.read_succeeded", "HQUERY",
                                "SELECT from Hive",
                                failed_step) and failed_step is None:
            failed_step = "SELECT from Hive"
    if default_health_value("hadoop.HIVE.drop_table_succeeded", "HIVE",
                            "DROP table in Hive Metastore",
                            failed_step) and failed_step is None:
        failed_step = "DROP table in Hive Metastore"
    if default_health_value("hadoop.HBASE.drop_table_succeeded", "HBASE",
                            "drop table in HBase", failed_step) and \
            failed_step is None:
        failed_step = "drop table in HBase"

    cdh_status_indicators = cdh.get_status_indicators()
    health_values.extend(cdh_status_indicators)
    overall = {}
    for health_val in health_values:
        try:
            current = overall[health_val.source]
            current_val = to_status(current.value)
            current_causes = current.causes
        except KeyError:
            current_val = 'OK'
            current_causes = []
        update = to_status(health_val.value)
        # If current is ERROR, output is ERROR, regardless
        # If current is WARN, output is WARN if update is OK but ERROR if further WARN or ERROR
        # If current is OK, output follows update: OK, WARN or ERROR
        out = 'ERROR'
        if current_val != "ERROR":
            if current_val == 'WARN':
                if update == 'OK':
                    out = 'WARN'
            if current_val == 'OK':
                out = update
        current_val = out
        current_causes.extend(health_val.causes)
        overall[health_val.source] = Event(
            health_val.timestamp, health_val.source,
            'hadoop.%s.health' % cdh.get_type(health_val.source),
            current_causes, current_val)
    values.extend(health_values)
    values.extend(overall.values())
    if display:
        self._do_display(values)
    return values
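# --- illustrative sketch (not part of the plugin) --------------------------
# The merge loop at the end of runner reduces many per-step health events
# into one event per service. A runnable distillation of its rule (the
# helper name is hypothetical):
def merge_health(current, update):
    ''' ERROR is sticky; WARN survives only an OK update; OK follows update. '''
    if current == 'ERROR':
        return 'ERROR'
    if current == 'WARN':
        return 'WARN' if update == 'OK' else 'ERROR'
    return update

assert merge_health('OK', 'WARN') == 'WARN'
assert merge_health('WARN', 'WARN') == 'ERROR'
assert merge_health('ERROR', 'OK') == 'ERROR'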
class TestCDHBlackboxPlugin(unittest.TestCase):
    ''' Set of unit tests designed to validate the cdh-blackbox plugin '''

    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.ApiResource')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.happybase.Connection')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jPype')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jdbApi')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.connect')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_name',
                lambda s, x: {'HBASE': 'hbase01',
                              'IMPALA': 'impala01',
                              'HIVE': 'hive01'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_type',
                lambda s, x: {'hbase01': 'HBASE',
                              'impala01': 'IMPALA',
                              'hive01': 'HIVE'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hbase_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hive_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_impala_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_status_indicators',
                lambda s: [])
    def test_pass_simple(self, impala_connect_mock, hive_mock, jpype_mock,
                         happybase_connection_mock, api_mock):
        # mock HBase connection.table.row
        _table = mock.MagicMock()
        _table.row.return_value = {'cf:column': 'value'}
        _hbase_conn = mock.MagicMock()
        _hbase_conn.table.return_value = _table
        happybase_connection_mock.return_value = _hbase_conn
        # mock Impala connection.cursor.fetchall
        _cursor = mock.MagicMock()
        _cursor.fetchall.return_value = [[None, 'value']]
        _impala_conn = mock.MagicMock()
        _impala_conn.cursor.return_value = _cursor
        impala_connect_mock.return_value = _impala_conn
        # mock JayDeBe connection and JPype
        _jpype = mock.MagicMock()
        _jpype.getDefaultJVMPath.return_value = None
        _jpype.startJVM.return_value = None
        jpype_mock.return_value = _jpype
        _hive_cursor = mock.MagicMock()
        _hive_cursor.fetchall.return_value = [[None, 'value']]
        _hive_conn = mock.MagicMock()
        _hive_conn.cursor.return_value = _hive_cursor
        hive_mock.return_value = _hive_conn
        plugin = CDHBlackboxPlugin()
        values = plugin.runner(("--cmhost 10.60.18.144 --cmhost 7777 "
                                "--cmuser user --cmpassword password "
                                "--hadoopdistro CDH"), True)
        api_mock.assert_called_with(password='******', server_host='7777',
                                    server_port='7180', username='******',
                                    version=11)
        num_res = [('hbase01', 'hadoop.HBASE.create_table_time_ms', [], 5),
                   ('hbase01', 'hadoop.HBASE.write_time_ms', [], 7),
                   ('hbase01', 'hadoop.HBASE.read_time_ms', [], 1),
                   ('hive01', 'hadoop.HIVE.connection_time_ms', [], 7),
                   ('hive01', 'hadoop.HIVE.create_metadata_time_ms', [], 2),
                   ('impala01', 'hadoop.IMPALA.connection_time_ms', [], 0),
                   ('impala01', 'hadoop.IMPALA.read_time_ms', [], 4),
                   ('hive01', 'hadoop.HIVE.drop_table_time_ms', [], 3),
                   ('hbase01', 'hadoop.HBASE.drop_table_time_ms', [], 7)]
        gen_res = [('hbase01', 'hadoop.HBASE.create_table_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.write_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.read_succeeded', [], True),
                   ('hive01', 'hadoop.HIVE.connection_succeeded', [], True),
                   ('hive01', 'hadoop.HIVE.create_metadata_succeeded', [], True),
                   ('impala01', 'hadoop.IMPALA.connection_succeeded', [], True),
                   ('impala01', 'hadoop.IMPALA.read_succeeded', [], True),
                   ('hive01', 'hadoop.HIVE.drop_table_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.drop_table_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.health', [], 'OK'),
                   ('impala01', 'hadoop.IMPALA.health', [], 'OK'),
                   ('hive01', 'hadoop.HIVE.health', [], 'OK')]
        self.assertEqual(len(values), len(num_res) +
                         len(gen_res))
        index = 0
        for check in num_res:
            self.assertEqual(check[0], values[index].source)
            self.assertEqual(check[1], values[index].metric)
            self.assertEqual(check[2], values[index].causes)
            self.assertTrue(isinstance(values[index].value, int) or
                            isinstance(values[index].value, long))
            index += 1
        for check in gen_res:
            self.assertEqual(check[0], values[index].source)
            self.assertEqual(check[1], values[index].metric)
            self.assertEqual(check[2], values[index].causes)
            self.assertEqual(check[3], values[index].value)
            index += 1

    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.ApiResource')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.happybase.Connection')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jPype')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jdbApi')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.connect')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_name',
                lambda s, x: {'HBASE': 'hbase01',
                              'IMPALA': 'impala01',
                              'HIVE': 'hive01',
                              'HDFS': 'hdfs01'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_type',
                lambda s, x: {'hbase01': 'HBASE',
                              'impala01': 'IMPALA',
                              'hive01': 'HIVE',
                              'hdfs01': 'HDFS'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hbase_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hive_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_impala_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_status_indicators',
                lambda s: [Event(0, 'hdfs01', 'hadoop.HDFS.cm_indicator',
                                 [], 'OK'),
                           Event(0, 'hbase01', 'hadoop.HBASE.cm_indicator',
                                 ['Cause A'], 'WARN'),
                           Event(0, 'hive01', 'hadoop.HIVE.cm_indicator',
                                 ['Cause B'], 'ERROR'),
                           Event(0, 'impala01', 'hadoop.IMPALA.cm_indicator',
                                 ['Cause C', 'Cause D'], 'WARN')])
    def test_merge_simple(self, impala_connect_mock, hive_mock, jpype_mock,
                          happybase_connection_mock, api_mock):
        # mock HBase connection.table.row with success
        _table = mock.MagicMock()
        _table.row.return_value = {'cf:column': 'value'}
        _hbase_conn = mock.MagicMock()
        _hbase_conn.table.return_value = _table
        happybase_connection_mock.return_value = _hbase_conn
        # mock Impala connection.cursor.fetchall with failure
        _cursor = mock.MagicMock()
        _cursor.fetchall.side_effect = Exception()
        _impala_conn = mock.MagicMock()
        _impala_conn.cursor.return_value = _cursor
        impala_connect_mock.return_value = _impala_conn
        # mock JayDeBe connection and JPype
        _jpype = mock.MagicMock()
        _jpype.getDefaultJVMPath.return_value = None
        _jpype.startJVM.return_value = None
        jpype_mock.return_value = _jpype
        _hive_cursor = mock.MagicMock()
        _hive_cursor.fetchall.return_value = [[None, 'value']]
        _hive_conn = mock.MagicMock()
        _hive_conn.cursor.return_value = _hive_cursor
        hive_mock.return_value = _hive_conn
        plugin = CDHBlackboxPlugin()
        values = plugin.runner(("--cmhost 10.60.18.144 --cmhost 7777 "
                                "--cmuser user --cmpassword password "
                                "--hadoopdistro CDH"), True)
        api_mock.assert_called_with(password='******', server_host='7777',
                                    server_port='7180', username='******',
                                    version=11)
        for value in values:
            if value.metric == 'hadoop.HDFS.health':
                self.assertEqual(value.value, 'OK')
            if value.metric == 'hadoop.HBASE.health':
                self.assertEqual(value.value, 'WARN')
                self.assertEqual(value.causes, ['Cause A'])
            if value.metric == 'hadoop.HIVE.health':
                self.assertEqual(value.value, 'ERROR')
                self.assertEqual(value.causes, ['Cause B'])
            if value.metric == 'hadoop.IMPALA.health':
                self.assertEqual(value.value, 'ERROR')
                self.assertEqual(value.causes,
                                 ['Failed to SELECT from Impala',
                                  'Cause C', 'Cause D'])

    def test_cdhdata(self):
        ''' Test that CDHData behaves as expected given a particular response from CM '''
        scheck = {'name': 'service check',
                  'explanation': 'service broken',
                  'summary': 'BAD'}
        rcheck = {'name': 'role check',
                  'explanation': 'role broken',
                  'summary': 'BAD'}
        hcheck = {'name': 'host check',
                  'explanation': 'host broken',
                  'summary': 'BAD'}
        role1 = mock.MagicMock(type='HBASETHRIFTSERVER',
                               healthChecks=[rcheck],
                               hostRef=mock.MagicMock(hostId=42))
        role2 = mock.MagicMock(type='HIVESERVER2',
                               healthChecks=[rcheck],
                               hostRef=mock.MagicMock(hostId=42))
        role3 = mock.MagicMock(type='IMPALAD',
                               healthChecks=[rcheck],
                               hostRef=mock.MagicMock(hostId=42))
        service1 = mock.MagicMock(type='HBASE', healthSummary='CONCERNING',
                                  healthChecks=[scheck])
        service1.name = 'hbase01'
        service1.get_all_roles.return_value = [role1]
        service2 = mock.MagicMock(type='HIVE', healthSummary='WARN',
                                  healthChecks=[scheck])
        service2.name = 'hive01'
        service2.get_all_roles.return_value = [role2]
        service3 = mock.MagicMock(type='IMPALA', healthSummary='BAD',
                                  healthChecks=[scheck])
        service3.name = 'impala01'
        service3.get_all_roles.return_value = [role3]
        host = mock.MagicMock(hostname='hostA', healthChecks=[hcheck])
        api_mock = mock.Mock()
        cluster_mock = mock.Mock()
        cluster_mock.get_all_services = mock.Mock(
            return_value=[service1, service2, service3])
        api_mock.get_host = mock.Mock(return_value=host)
        cdhdata = CDHData(api_mock, cluster_mock)
        self.assertEqual(cdhdata.get_hive_endpoint(), 'hostA')
        self.assertEqual(cdhdata.get_hbase_endpoint(), 'hostA')
        self.assertEqual(cdhdata.get_impala_endpoint(), 'hostA')
        indicators = cdhdata.get_status_indicators()
        self.assertEqual(len(indicators), 3)
        for indicator in indicators:
            self.assertTrue(indicator.source in
                            ['hive01', 'hbase01', 'impala01'])
            if indicator.source == 'hbase01':
                self.assertEqual(indicator.metric, 'hadoop.HBASE.cm_indicator')
                self.assertEqual(indicator.value, 'WARN')
            if indicator.source == 'hive01':
                self.assertEqual(indicator.metric, 'hadoop.HIVE.cm_indicator')
                self.assertEqual(indicator.value, 'WARN')
            if indicator.source == 'impala01':
                self.assertEqual(indicator.metric, 'hadoop.IMPALA.cm_indicator')
                self.assertEqual(indicator.value, 'ERROR')
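# --- illustrative sketch (not part of the test module) ---------------------
# The pattern the tests above rely on, in miniature: patch a collaborator,
# drive the code under test, and assert on the recorded call. Runnable
# stand-alone; the names below are hypothetical, not part of the plugin.
import mock

def fetch_status(client):
    return client.get('/health').status_code

def demo():
    _client = mock.MagicMock()
    _client.get.return_value = mock.MagicMock(status_code=200)
    assert fetch_status(_client) == 200
    _client.get.assert_called_with('/health')

demo()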
def analyse_results(self, zk_data, test_result):
    '''
    Analyse the partition summary and Prod2Cons,
    then set the test result flag accordingly.
    If the test flag is not green, add a reason explaining why,
    then return the result as an Event.
    '''
    analyse_status = MonitorStatus["green"]
    analyse_causes = []
    analyse_metric = 'kafka.health'
    if zk_data and zk_data.list_zk_ko:
        # degrade to amber only while at least ceil(n/2) nodes are reachable
        zk_majority = int(math.ceil(
            float(len(zk_data.list_zk.split(","))) / 2))
        if zk_data.num_zk_ok >= zk_majority:
            LOGGER.warn(
                "analyse_results : at least one zookeeper node failed")
            analyse_status = MonitorStatus["amber"]
            analyse_causes.append("zookeeper node(s) unreachable (%s)" %
                                  zk_data.list_zk_ko)
        else:
            LOGGER.error(
                "analyse_results : at least one zookeeper node failed")
            analyse_status = MonitorStatus["red"]
            analyse_causes.append("zookeeper node(s) unreachable (%s)" %
                                  zk_data.list_zk_ko)
    if zk_data and zk_data.list_brokers_ko:
        LOGGER.error("analyse_results : at least one broker failed")
        analyse_status = MonitorStatus["red"]
        analyse_causes.append("broker(s) unreachable (%s)" %
                              zk_data.list_brokers_ko)
    if zk_data and zk_data.num_part_ko > 0:
        LOGGER.error(
            "analyse_results : at least one topic / partition inconsistency")
        if analyse_status != MonitorStatus["red"]:
            analyse_status = MonitorStatus["amber"]
        analyse_causes.append("topic / partition inconsistency in zookeeper")
    if self.prod2cons:
        if test_result.sent == test_result.received \
        and test_result.notvalid == 0:
            LOGGER.debug(
                "analyse_results - test for messages sent / received is valid")
        else:
            LOGGER.error(
                "analyse_results - test for messages sent / received failed")
            analyse_status = MonitorStatus["red"]
            analyse_causes.append("producer / consumer failed "
                                  "(sent %d, rcv_ok %d, rcv_ko %d)" %
                                  (test_result.sent, test_result.received,
                                   test_result.notvalid))
    # whitebox analysis
    if self.whitebox_error_code != -1:
        if self.whitebox_error_code == 101:
            LOGGER.warn("UnderReplicatedPartitions should be 0")
            if analyse_status != MonitorStatus["red"]:
                analyse_status = MonitorStatus["amber"]
            analyse_causes.append("UnderReplicatedPartitions should be 0")
        elif self.whitebox_error_code == 102:
            LOGGER.warn("ActiveControllerCount: only one broker in the "
                        "cluster should have 1")
            if analyse_status != MonitorStatus["red"]:
                analyse_status = MonitorStatus["amber"]
            analyse_causes.append("ActiveControllerCount: only one broker "
                                  "in the cluster should have 1")
        elif self.whitebox_error_code == 104:
            LOGGER.warn(
                "analyse_results : unclean leader election rate should be 0")
            if analyse_status != MonitorStatus["red"]:
                analyse_status = MonitorStatus["amber"]
            analyse_causes.append("unclean leader election rate should be 0")
    return Event(TIMESTAMP_MILLIS(), 'kafka', analyse_metric,
                 analyse_causes, analyse_status)
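# --- illustrative sketch (not part of the plugin) --------------------------
# The zookeeper degradation rule above, in isolation: with n nodes the
# plugin treats ceil(n / 2) reachable nodes as enough for amber rather than
# red. Runnable check of that arithmetic (helper name hypothetical):
import math

def zk_quorum(total_nodes):
    ''' Smallest number of live nodes the plugin still accepts as amber. '''
    return int(math.ceil(float(total_nodes) / 2))

assert zk_quorum(5) == 3
assert zk_quorum(4) == 2
assert zk_quorum(3) == 2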
def process(self, zknodes, gbrokers, partitions):
    '''
    Returns a named tuple of type PartitionsSummary.
    '''
    LOGGER.debug("process started")
    topic_ok = 0
    topic_ko = 0
    process_results = []
    for obj in partitions:
        parts_object = obj.partitions["list"]
        if obj.partitions["valid"] is True:
            for parts in parts_object:
                # Get the partition leader
                for part, partinfo in parts.iteritems():
                    leader_read = partinfo['leader']
                    broker = get_broker_by_id(gbrokers, '%d' % leader_read)
                    if broker is not None:
                        process_results.append(
                            PartitionState(broker.host, broker.port,
                                           obj.id, part,
                                           obj.partitions["valid"]))
            topic_ok += 1
        else:
            topic_ko += 1
            LOGGER.error("Topic not in a good state (%s)", obj.id)
            process_results.append(
                PartitionState(None, None, obj.id, None,
                               obj.partitions["valid"]))
    self.results.append(
        Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.nodes', [],
              gbrokers.connect))
    self.results.append(
        Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.nodes.ok', [],
              gbrokers.num_ok))
    self.results.append(
        Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.nodes.ko', [],
              gbrokers.num_ko))
    self.results.append(
        Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.partitions.ok', [],
              topic_ok))
    self.results.append(
        Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.partitions.ko', [],
              topic_ko))
    LOGGER.debug("process finished")
    return MonitorSummary(num_partitions=len(process_results),
                          list_brokers=gbrokers.connect,
                          list_brokers_ko=gbrokers.error,
                          num_brokers_ok=gbrokers.num_ok,
                          num_brokers_ko=gbrokers.num_ko,
                          list_zk=zknodes.connect,
                          list_zk_ko=zknodes.error,
                          num_zk_ok=zknodes.num_ok,
                          num_zk_ko=zknodes.num_ko,
                          num_part_ok=topic_ok,
                          num_part_ko=topic_ko,
                          partitions=tuple(process_results))
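# --- illustrative sketch (not part of the plugin) --------------------------
# The tuple shapes consumed and produced above, written out for reference.
# The MonitorSummary fields are taken from the keyword arguments used in the
# return statement; the PartitionState field names are a guess from the
# positional arguments. The real definitions live elsewhere in the repo.
from collections import namedtuple

PartitionState = namedtuple('PartitionState',
                            ['host', 'port', 'topic', 'partition', 'valid'])
MonitorSummary = namedtuple('MonitorSummary',
                            ['num_partitions', 'list_brokers',
                             'list_brokers_ko', 'num_brokers_ok',
                             'num_brokers_ko', 'list_zk', 'list_zk_ko',
                             'num_zk_ok', 'num_zk_ko', 'num_part_ok',
                             'num_part_ko', 'partitions'])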
def update(self):
    '''
    Retrieve endpoint metadata & overall health indicators from Ambari,
    plus any reason codes.
    Returns a sequence of Event tuples with metrics taking the form
    hadoop.%s.cm_indicator
    '''
    self._values = []
    self._metadata = {'names': {}, 'types': {}}

    def get_health_state(alert_state):
        ''' Convert an Ambari alert state to a health state '''
        if alert_state == 'CRITICAL':
            return "ERROR"
        elif alert_state == 'WARNING':
            return "WARN"
        return "OK"

    # get cluster name
    cluster_uri = requests.get(
        '%s/clusters' % self._ambari_api,
        auth=self._http_auth,
        headers=self._http_headers).json()['items'][0]['href']

    # get all alerts and aggregate a health summary from the alert list
    alerts = requests.get(
        '%s/alerts?fields=Alert/component_name,Alert/text,Alert/'
        'label,Alert/state&Alert/maintenance_state.in(OFF)' % cluster_uri,
        auth=self._http_auth,
        headers=self._http_headers).json()['items']
    self._metadata['names']['HQUERY'] = 'HQUERY'
    self._metadata['types']['HQUERY'] = 'HQUERY'
    service_health_store = {}
    service_health_causes = {}
    for alert_item in alerts:
        alert_info = alert_item['Alert']
        service_name = alert_info['service_name']
        if service_name == 'SPARK':
            service_type = 'SPARK_ON_YARN'
        elif service_name == 'AMBARI':
            service_type = 'CLUSTER_MANAGER'
        elif service_name == 'AMBARI_METRICS':
            service_name = 'AMBARI'
            service_type = 'CLUSTER_MANAGER'
        else:
            service_type = service_name
        self._metadata['names'][service_type] = service_name
        self._metadata['types'][service_name] = service_type
        current_health = service_health_store[
            service_name] if service_name in service_health_store else 'OK'
        new_health = get_health_state(alert_info['state'])
        updated_health = self._update_health(current_health, new_health)
        service_health_store[service_name] = updated_health
        if new_health in ['ERROR', 'WARN']:
            current_causes = service_health_causes[
                service_name] if service_name in service_health_causes else []
            current_causes.append(
                '%s: %s - %s' % (alert_info['host_name'],
                                 alert_info['label'], alert_info['text']))
            service_health_causes[service_name] = current_causes

    # Write out an event for each service
    for service_name in service_health_store:
        self._values.append(
            Event(
                TIMESTAMP_MILLIS(), service_name,
                "hadoop.%s.cm_indicator" %
                self._metadata['types'][service_name],
                list(set(service_health_causes[service_name]
                         if service_name in service_health_causes else [])),
                service_health_store[service_name]))

    # Grab endpoints used by other tests
    query = 'fields=host_components'
    self._metadata['hbase_endpoint'] = requests.get(
        '%s/services/HBASE/components/HBASE_MASTER?%s' % (cluster_uri, query),
        auth=self._http_auth,
        headers=self._http_headers).json(
        )['host_components'][0]['HostRoles']['host_name']
    self._metadata['hive_endpoint'] = requests.get(
        '%s/services/HIVE/components/HIVE_SERVER?%s' % (cluster_uri, query),
        auth=self._http_auth,
        headers=self._http_headers).json(
        )['host_components'][0]['HostRoles']['host_name']
    self._metadata['impala_endpoint'] = None
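# --- illustrative sketch (not part of the plugin) --------------------------
# The Ambari alert-to-health mapping above, written as a lookup table; a
# dict with a default is equivalent to the if/elif chain and easier to
# extend. Runnable checks follow.
ALERT_TO_HEALTH = {'CRITICAL': 'ERROR', 'WARNING': 'WARN'}

def get_health_state(alert_state):
    ''' Any state other than CRITICAL or WARNING counts as healthy. '''
    return ALERT_TO_HEALTH.get(alert_state, 'OK')

assert get_health_state('CRITICAL') == 'ERROR'
assert get_health_state('OK') == 'OK'
assert get_health_state('UNKNOWN') == 'OK'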
def runner(self, args, display=True):
    '''
    Main section.
    '''
    plugin_args = args.split() \
        if args is not None and (len(args.strip()) > 0) \
        else ""
    options = self.read_args(plugin_args)
    values = []

    try:
        start = TIMESTAMP_MILLIS()
        with eventlet.Timeout(100):
            req = requests.get("%s/repository/packages" % (options.dmendpoint),
                               timeout=20)
        end = TIMESTAMP_MILLIS()
        packages_available_ok = True
        packages_available_count = len(req.json())
        packages_available_ms = end - start
        values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager",
                            "deployment-manager.packages_available_time_ms",
                            [], packages_available_ms))
        values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager",
                            "deployment-manager.packages_available_count",
                            [], packages_available_count))
    except Exception:
        packages_available_ok = False
    values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager",
                        "deployment-manager.packages_available_succeeded",
                        [], packages_available_ok))

    try:
        start = TIMESTAMP_MILLIS()
        with eventlet.Timeout(100):
            req = requests.get("%s/packages" % (options.dmendpoint),
                               timeout=20)
        end = TIMESTAMP_MILLIS()
        packages_deployed_ok = True
        packages_deployed_count = len(req.json())
        packages_deployed_ms = end - start
        values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager",
                            "deployment-manager.packages_deployed_time_ms",
                            [], packages_deployed_ms))
        values.append(Event(TIMESTAMP_MILLIS(), 'deployment-manager',
                            "deployment-manager.packages_deployed_count",
                            [], packages_deployed_count))
    except Exception:
        packages_deployed_ok = False
    values.append(Event(TIMESTAMP_MILLIS(), 'deployment-manager',
                        "deployment-manager.packages_deployed_succeeded",
                        [], packages_deployed_ok))

    cause = ""
    health = "OK"
    if not packages_available_ok or not packages_deployed_ok:
        health = "ERROR"
        cause = "Deployment manager package APIs are not working"
    values.append(Event(TIMESTAMP_MILLIS(), 'deployment-manager',
                        'deployment-manager.health', [cause], health))
    if display:
        self._do_display(values)
    return values
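# --- illustrative sketch (not part of the plugin) --------------------------
# The timeout pattern above, isolated: eventlet.Timeout raises inside the
# with-block if the block runs too long, and requests' own socket timeout
# acts as a second guard. Note eventlet timeouts only fire across green
# thread switches, so this assumes eventlet monkey-patching is in effect
# elsewhere in the plugin, as it appears to be here. The URL below is a
# placeholder and timed_get is a hypothetical helper name.
import eventlet
import requests

def timed_get(url, hard_limit=100, socket_timeout=20):
    ''' GET url, giving up after socket_timeout per read and hard_limit overall. '''
    with eventlet.Timeout(hard_limit):
        return requests.get(url, timeout=socket_timeout)

# response = timed_get('http://dm.example.com/repository/packages')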
def update(self):
    '''
    Retrieve endpoint metadata & overall health indicators from CM,
    plus any reason codes.
    Returns a sequence of Event tuples with metrics taking the form
    hadoop.%s.cm_indicator
    '''
    self._values = []
    self._metadata = {'names': {}, 'types': {}}

    def is_bad(summary):
        ''' Designated 'bad' status results '''
        return summary in ["BAD", "CONCERNING", "ERROR", "WARN"]

    def get_causes(health_checks):
        ''' Extract causes from health check results '''
        return [
            "%s%s" % (chk['name'],
                      ":" + chk['explanation']
                      if 'explanation' in chk.keys() else '')
            for chk in health_checks if is_bad(chk['summary'])
        ]

    def update_health(current, updated):
        ''' Given current health and an update, return the new current health '''
        updated_health = current
        if current != 'ERROR' and (updated == 'CONCERNING' or
                                   updated == 'WARN'):
            updated_health = 'WARN'
        elif updated == 'BAD' or updated == 'ERROR':
            updated_health = 'ERROR'
        return updated_health

    # Main body of function - single pass over all services picking up
    # endpoints, health of each service and causes in the case of poor health
    for service in self._cluster.get_all_services():
        self._metadata['names'][service.type] = service.name
        self._metadata['types'][service.name] = service.type
        service_health = update_health('OK', service.healthSummary)
        causes = get_causes(service.healthChecks)
        for role in service.get_all_roles():
            if role.type == "HBASETHRIFTSERVER":
                self._metadata['hbase_endpoint'] = \
                    self._api.get_host(role.hostRef.hostId).hostname
            if role.type == "HIVESERVER2":
                self._metadata['hive_endpoint'] = \
                    self._api.get_host(role.hostRef.hostId).hostname
            if role.type == "IMPALAD":
                self._metadata['impala_endpoint'] = \
                    self._api.get_host(role.hostRef.hostId).hostname
            host = self._api.get_host(role.hostRef.hostId)
            causes.extend(
                get_causes(self._api.get_host(host.hostId).healthChecks))
            causes.extend(get_causes(role.healthChecks))
        self._values.append(
            Event(TIMESTAMP_MILLIS(), service.name,
                  "hadoop.%s.cm_indicator" % service.type,
                  list(set(causes)), service_health))
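# --- illustrative sketch (not part of the plugin) --------------------------
# update_health above maps Cloudera Manager summaries onto the plugin's
# three-level scale. Its behaviour, pinned down with runnable checks on a
# mirror of the nested helper:
def update_health(current, updated):
    ''' Mirror of the nested helper above, for illustration only. '''
    updated_health = current
    if current != 'ERROR' and updated in ('CONCERNING', 'WARN'):
        updated_health = 'WARN'
    elif updated in ('BAD', 'ERROR'):
        updated_health = 'ERROR'
    return updated_health

assert update_health('OK', 'GOOD') == 'OK'              # healthy summaries pass through
assert update_health('OK', 'CONCERNING') == 'WARN'      # CM 'CONCERNING' becomes WARN
assert update_health('OK', 'BAD') == 'ERROR'            # CM 'BAD' becomes ERROR
assert update_health('ERROR', 'CONCERNING') == 'ERROR'  # ERROR is sticky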
def runner(self, args, display=True):
    '''
    Main section.
    '''
    plugin_args = args.split() \
        if args is not None and args.strip() \
        else ""
    options = self.read_args(plugin_args)
    cause = []
    values = []
    packages_available_ok, packages_deployed_ok = False, False
    packages_available_count, packages_deployed_count = -1, -1
    packages_available_ms, packages_deployed_ms = -1, -1

    # noinspection PyBroadException
    try:
        path = '/repository/packages'
        start = TIMESTAMP_MILLIS()
        with eventlet.Timeout(100):
            req = requests.get("%s%s" % (options.dmendpoint, path),
                               timeout=20)
        end = TIMESTAMP_MILLIS()
        packages_available_ms = end - start
        status, msg = DMBlackBox.validate_api_response(req, path)
        if status == 'SUCCESS':
            packages_available_ok = True
            packages_available_count = len(req.json())
        else:
            cause.append(msg)
    except RequestException:
        cause.append(
            'Unable to connect to the Deployment Manager (request path = {})'
            .format(path))
    except Exception as e:
        cause.append('Platform Testing Client Error- ' + str(e))

    # noinspection PyBroadException
    try:
        path = '/packages'
        start = TIMESTAMP_MILLIS()
        with eventlet.Timeout(100):
            req = requests.get("%s%s" % (options.dmendpoint, path),
                               timeout=20)
        end = TIMESTAMP_MILLIS()
        packages_deployed_ms = end - start
        status, msg = DMBlackBox.validate_api_response(req, path)
        if status == 'SUCCESS':
            packages_deployed_ok = True
            packages_deployed_count = len(req.json())
        else:
            cause.append(msg)
    except RequestException:
        cause.append(
            'Unable to connect to the Deployment Manager (request path = {})'
            .format(path))
    except Exception as e:
        cause.append('Platform Testing Client Error- ' + str(e))

    values.append(
        Event(TIMESTAMP_MILLIS(), "deployment-manager",
              "deployment-manager.packages_available_time_ms", [],
              packages_available_ms))
    values.append(
        Event(TIMESTAMP_MILLIS(), "deployment-manager",
              "deployment-manager.packages_available_succeeded", [],
              packages_available_ok))
    values.append(
        Event(TIMESTAMP_MILLIS(), "deployment-manager",
              "deployment-manager.packages_available_count", [],
              packages_available_count))
    values.append(
        Event(TIMESTAMP_MILLIS(), "deployment-manager",
              "deployment-manager.packages_deployed_time_ms", [],
              packages_deployed_ms))
    values.append(
        Event(TIMESTAMP_MILLIS(), 'deployment-manager',
              "deployment-manager.packages_deployed_succeeded", [],
              packages_deployed_ok))
    values.append(
        Event(TIMESTAMP_MILLIS(), 'deployment-manager',
              "deployment-manager.packages_deployed_count", [],
              packages_deployed_count))

    health = "OK"
    if not packages_available_ok or not packages_deployed_ok:
        health = "ERROR"
    values.append(
        Event(TIMESTAMP_MILLIS(), 'deployment-manager',
              'deployment-manager.health', cause, health))
    if display:
        self._do_display(values)
    return values
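# --- illustrative sketch (not part of the plugin) --------------------------
# validate_api_response is defined elsewhere in the repo; the stand-in below
# is only an assumption consistent with how it is called above: it returns a
# ('SUCCESS', msg) or ('FAILURE', msg) pair given a response and the request
# path, treating any status code outside the expected set as a failure.
def validate_api_response(response, path, extra_expected_codes=None):
    ''' Hypothetical re-creation, not the repo's actual implementation. '''
    expected = [200] + (extra_expected_codes or [])
    if response.status_code in expected:
        return 'SUCCESS', ''
    return ('FAILURE',
            'Unexpected status %d for request path %s' %
            (response.status_code, path))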
def runner(self, args, display=True):
    """
    Main section.
    """
    plugin_args = args.split() \
        if args is not None and args.strip() \
        else ""
    options = self.read_args(plugin_args)
    cause = []
    values = []
    hs_available_success, hs_completed_jobs_success = False, False
    hs_available_ms, hs_completed_jobs_ms = -1, -1
    installed_flink_version, completed_job_count = '', -1

    # noinspection PyBroadException
    try:
        path = '/config'
        start = TIMESTAMP_MILLIS()
        with eventlet.Timeout(100):
            req = requests.get("%s%s" % (options.fhendpoint, path),
                               timeout=20)
        end = TIMESTAMP_MILLIS()
        hs_available_ms = end - start
        status, msg = Flink.validate_api_response(req, path)
        if status == 'SUCCESS':
            installed_flink_version = json.loads(req.text).get(
                "flink-version", '')
            hs_available_success = True
        else:
            cause.append(msg)
    except RequestException:
        cause.append(
            'Unable to connect to the Flink History Server (request path = {})'
            .format(path))
    except Exception as except_obj:
        cause.append('Platform Testing Client Error- ' + str(except_obj))

    # noinspection PyBroadException
    try:
        path = '/joboverview'
        start = TIMESTAMP_MILLIS()
        with eventlet.Timeout(100):
            req = requests.get("%s%s" % (options.fhendpoint, path),
                               timeout=20)
        end = TIMESTAMP_MILLIS()
        hs_completed_jobs_ms = end - start
        # 404 is added to the expected response codes because the Flink
        # history server returns 404 unless at least one flink job has
        # been executed.
        status, msg = Flink.validate_api_response(req, path, [404])
        if status == 'SUCCESS':
            if req.status_code == 200:
                completed_job_count = len(
                    json.loads(req.text).get('finished'))
            elif req.status_code == 404:
                completed_job_count = 0
            hs_completed_jobs_success = True
        else:
            cause.append(msg)
    except RequestException:
        cause.append(
            'Unable to connect to the Flink History Server (request path = {})'
            .format(path))
    except Exception as except_obj:
        cause.append('Platform Testing Client Error- ' + str(except_obj))

    values.append(
        Event(TIMESTAMP_MILLIS(), "flink",
              "flink.history_server_available_success", [],
              hs_available_success))
    values.append(
        Event(TIMESTAMP_MILLIS(), "flink",
              "flink.installed_flink_version", [], installed_flink_version))
    values.append(
        Event(TIMESTAMP_MILLIS(), "flink",
              "flink.history_server_available_ms", [], hs_available_ms))
    values.append(
        Event(TIMESTAMP_MILLIS(), "flink",
              "flink.history_server_completed_jobs_success", [],
              hs_completed_jobs_success))
    values.append(
        Event(TIMESTAMP_MILLIS(), "flink",
              "flink.history_server_completed_jobs_count", [],
              completed_job_count))
    values.append(
        Event(TIMESTAMP_MILLIS(), "flink",
              "flink.history_server_completed_jobs_ms", [],
              hs_completed_jobs_ms))

    health = "OK"
    if not hs_available_success or not hs_completed_jobs_success:
        health = "ERROR"
    values.append(
        Event(TIMESTAMP_MILLIS(), 'flink', 'flink.health', cause, health))
    if display:
        self._do_display(values)
    return values
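# --- illustrative sketch (not part of the plugin) --------------------------
# The /joboverview handling above, isolated: a 200 response carries a JSON
# body whose 'finished' list is counted, while a 404 simply means no job has
# run yet. Runnable distillation (helper name hypothetical):
import json

def count_finished_jobs(status_code, body):
    ''' 200 -> count 'finished' entries; 404 -> no jobs have run yet. '''
    if status_code == 200:
        return len(json.loads(body).get('finished'))
    if status_code == 404:
        return 0
    return -1

assert count_finished_jobs(200, '{"finished": [{"jid": "a"}, {"jid": "b"}]}') == 2
assert count_finished_jobs(404, '') == 0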
                    .rstrip('\r\n')
                if zkelect == "leader" or zkelect == "standalone":
                    zk_election = True
                self.results.append(
                    Event(TIMESTAMP_MILLIS(), 'zookeeper',
                          'zookeeper.%d.mode' % (zid), [], zkelect))
            except ZkError, ex:
                LOGGER.error('Failed to access Zookeeper: %s', str(ex))
                break
            except ProcessorError, ex:
                LOGGER.error('Failed to process: %s', str(ex))
                break
        else:
            self.results.append(
                Event(TIMESTAMP_MILLIS(), 'zookeeper',
                      'zookeeper.%d.mode' % (zid), [],
                      MonitorStatus["red"]))
        zid += 1

    if not zk_data:
        zk_data = ZkMonitorSummary(list_zk=self.zconnect,
                                   list_zk_ko=self.zconnect,
                                   num_zk_ok=0,
                                   num_zk_ko=len(zknodes))

    # ----------------------------------------
    # Let's build the global result structure
    # ----------------------------------------
    results_summary = analyse_results(zk_data, zk_election)

    # ----------------------------------------
    # if output display is required
    # ----------------------------------------