示例#1
0
        def default_health_value(name, service, operation, failed_step):
            """Append a timed-out health Event for *name* if none was recorded.

            Returns True when a placeholder failure Event was added to
            health_values, False when the metric already has a real result.
            """
            already_reported = any(event.metric == name for event in health_values)
            if already_reported:
                return False
            if failed_step is None:
                message = 'Timed out waiting for %s to complete' % operation
            else:
                message = 'Did not attempt to %s due to timeout waiting for: %s' % (operation, failed_step)
            health_values.append(Event(TIMESTAMP_MILLIS(),
                                       cdh.get_name(service),
                                       name,
                                       [message],
                                       False))
            return True
    def get_uncleanleaderelections(self, host, broker_id):
        '''
        Get UncleanLeaderElectionsPerSec.

        Fetches each attribute of the kafka.controller ControllerStats
        UncleanLeaderElectionsPerSec MBean from the local jmxproxy for
        *host*, records every value returned as an Event on self.results,
        and sets whitebox_error_code 104 when the fifteen-minute rate is
        above the alert threshold.
        '''
        # Build the threshold as Decimal from a string so the comparison with
        # the Decimal rate is exact (the float literal 0.0002 is not).
        threshold = Decimal("0.0002")
        unclean_count = None
        unclean_rate = None
        for jmx_data in [
                "RateUnit", "OneMinuteRate", "EventType", "Count",
                "FifteenMinuteRate", "FiveMinuteRate", "MeanRate"
        ]:
            url_jmxproxy = ("http://127.0.0.1:8000/jmxproxy/%s/"
                            "kafka.controller:type=ControllerStats,"
                            "name=UncleanLeaderElectionsPerSec/%s") % (
                                host, jmx_data)

            response = requests.get(url_jmxproxy)
            if response.status_code == 200:
                # Fixed log-message typo ("fo" -> "for").
                LOGGER.debug("Getting %s for %s", response.text, url_jmxproxy)
                self.results.append(
                    Event(TIMESTAMP_MILLIS(), 'kafka',
                          ('kafka.brokers.%d.'
                           'controllerstats.UncleanLeaderElections.%s') %
                          (broker_id, jmx_data), [], response.text))

                if jmx_data == "Count":
                    unclean_count = int(response.text)
                elif jmx_data == "FifteenMinuteRate":
                    unclean_rate = Decimal(response.text)

            else:
                LOGGER.error("ERROR for url_jmxproxy: %s", url_jmxproxy)
        # Only raise the alert when both metrics were actually retrieved.
        if unclean_count is not None and unclean_rate is not None:
            if unclean_rate > threshold:
                LOGGER.debug(
                    "broker %d threshold is %f and current rate is %f",
                    broker_id, threshold, unclean_rate)
                self.whitebox_error_code = 104

        return None
示例#3
0
    def get_underreplicatedpartitions(self, host, broker_id):
        '''
        Get underreplicatedpartitions.

        Queries the local jmxproxy for the ReplicaManager
        UnderReplicatedPartitions value of one broker, records it as an
        Event on self.results, and sets whitebox_error_code 101 when any
        partition is under-replicated.
        '''
        url_jmxproxy = ("http://127.0.0.1:8000/jmxproxy/%s/"
                        "kafka.server:type=ReplicaManager,"
                        "name=UnderReplicatedPartitions/Value") % host

        response = requests.get(url_jmxproxy)
        if response.status_code == 200:
            # Fixed log-message typo ("fo" -> "for").
            LOGGER.debug("Getting %s for %s", response.text, url_jmxproxy)
            self.results.append(
                Event(TIMESTAMP_MILLIS(), 'kafka',
                      'kafka.brokers.%d.UnderReplicatedPartitions' % broker_id,
                      [], response.text))
            # response.text is a string; any non-zero count is a problem.
            if response.text != "0":
                self.whitebox_error_code = 101

        else:
            LOGGER.error("ERROR for url_jmxproxy: %s", url_jmxproxy)

        return None
示例#4
0
def analyse_results(zk_data, zk_election):
    '''
    Analyse the partition summary and Prod2Cons results.

    Sets the test result flag to green unless a problem is found; when the
    flag is not green, a reason explaining why is attached.  Returns the
    outcome wrapped in an Event.
    '''
    status = MonitorStatus["green"]
    causes = []
    metric = 'zookeeper.health'

    nodes_down = bool(zk_data) and len(zk_data.list_zk_ko) > 0
    if nodes_down:
        LOGGER.error("analyse_results : at least one zookeeper node failed")
        status = MonitorStatus["red"]
        causes.append("zookeeper node(s) unreachable (%s)" %
                      zk_data.list_zk_ko)
    elif zk_election is False:
        LOGGER.error(
            "analyse_results : zookeeper election not done, check nodes mode")
        status = MonitorStatus["red"]
        causes.append("zookeeper election not done, check nodes mode")
    return Event(TIMESTAMP_MILLIS(), 'zookeeper', metric, causes, status)
    def get_activecontrollercount(self, host, broker_id):
        '''
        Get activecontrollercount.

        Queries the local jmxproxy for the KafkaController
        ActiveControllerCount value of one broker and records it as an
        Event on self.results.  Exactly one broker should report a count of
        1; if a second broker also reports 1, whitebox_error_code 102 is
        raised.
        '''
        url_jmxproxy = ("http://127.0.0.1:8000/jmxproxy/%s/"
                        "kafka.controller:type=KafkaController,"
                        "name=ActiveControllerCount/Value") % host

        response = requests.get(url_jmxproxy)
        if response.status_code == 200:
            # Fixed log-message typo ("fo" -> "for").
            LOGGER.debug("Getting %s for %s", response.text, url_jmxproxy)
            self.results.append(
                Event(TIMESTAMP_MILLIS(), 'kafka',
                      'kafka.brokers.%d.ActiveControllerCount' % broker_id, [],
                      response.text))
            # response.text is a string: the original compared it to the int
            # 1, which is always False and made this whole check dead code.
            if response.text == "1":
                if self.activecontrollercount == -1:
                    # NOTE(review): original tested `!= -1`, which made the
                    # elif below unreachable; assuming -1 is the
                    # "no controller seen yet" sentinel — confirm against
                    # the attribute's initialisation.
                    self.activecontrollercount = 1
                elif self.activecontrollercount == 1:
                    # A second broker claiming to be the active controller
                    # indicates a split cluster.
                    self.whitebox_error_code = 102

        else:
            LOGGER.error("ERROR for url_jmxproxy: %s", url_jmxproxy)

        return None
示例#6
0
    def runner(self, args, display=True):
        '''
        Execute the blackbox test sequence against the cluster.

        ``args`` is the raw plugin argument string (may be None);
        ``display`` controls whether the collected Events are printed at
        the end.  Returns the list of metric and health Events gathered.
        '''
        values = []          # timing / metric Events
        health_values = []   # pass-fail Events, rolled up per service later

        # read_args receives a token list when args is non-empty, otherwise
        # the empty string "" (NOTE(review): "" rather than [] — presumably
        # read_args copes with both; confirm).
        plugin_args = args.split() \
                    if args is not None and (len(args.strip()) > 0) \
                    else ""

        options = self.read_args(plugin_args)

        # Pick the cluster-manager backend: Cloudera Manager API for CDH,
        # HDPData otherwise.
        if options.hadoopdistro == 'CDH':
            api = ApiResource(server_host=options.cmhost, \
                            server_port=options.cmport, \
                            username=options.cmuser, \
                            password=options.cmpassword, \
                            version=11)
            cluster = api.get_cluster(api.get_all_clusters()[0].name)
            cdh = CDHData(api, cluster)
        else:
            cdh = HDPData(options.cmhost, options.cmuser, options.cmpassword)
        # NOTE(review): run_test_sequence() assigns its own local ``hbase``,
        # so this name stays None in this scope — see the cleanup further
        # down; confirm whether that is intentional.
        hbase = None
        def run_test_sequence():
            # pylint: disable=too-many-return-statements
            '''
            Run the ordered blackbox steps: create/write/read an HBase test
            table, create Hive metadata over it, read it back through
            Impala (or Hive when no Impala endpoint exists), then drop the
            metadata and the table.

            Executes on a worker thread.  Timing Events go to the
            closed-over ``values`` list and pass/fail Events to
            ``health_values``; the closed-over ``abort_test_sequence`` flag
            is polled between steps so a timed-out run stops early.
            '''
            # NOTE(review): this creates a new local binding — the outer
            # ``hbase = None`` in runner() is never updated, so the
            # connection opened here is not closed by the caller; confirm.
            hbase = happybase.Connection(host=cdh.get_hbase_endpoint())
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()

                try:
                    hbase.create_table('blackbox_test_table', {'cf': dict()})
                    logging.debug("test table created")
                except AlreadyExists:
                    # Leftover table from an earlier run — reuse it.
                    logging.debug("test table exists")

                table = hbase.table('blackbox_test_table')
                end = TIMESTAMP_MILLIS()
                create_table_ok = True
                create_table_ms = end - start
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.create_table_time_ms", [],
                          create_table_ms))
            except:
                # Deliberately broad: any failure marks this step red but
                # the sequence keeps going so later steps still report.
                LOGGER.error(traceback.format_exc())
                create_table_ok = False
                reason = ['Create HBase table operation failed']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.create_table_succeeded", reason,
                      create_table_ok))

            #write some data to it
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                # NOTE(review): ``table`` is unbound if the create step
                # threw before assigning it; the resulting NameError is
                # caught below and reported as a write failure.
                table.put('row_key', {'cf:column': 'value'})
                end = TIMESTAMP_MILLIS()
                write_hbase_ok = True
                write_hbase_ms = end - start
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.write_time_ms", [], write_hbase_ms))
            except:
                LOGGER.error(traceback.format_exc())
                write_hbase_ok = False
                reason = ['Failed to insert row in HBase table']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.write_succeeded", reason, write_hbase_ok))

            #read some data from it
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                row = table.row('row_key', columns=['cf:column'])
                end = TIMESTAMP_MILLIS()
                read_hbase_ms = end - start
                read_hbase_ok = row['cf:column'] == 'value'
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.read_time_ms", [], read_hbase_ms))
            except:
                LOGGER.error(traceback.format_exc())
                # A failed read is treated as a corrupt test table: run an
                # hbck repair, then remove the table's znode and its HDFS
                # data so the next run starts clean.
                hbase_fix_output = subprocess.check_output([
                    'sudo', '-u', 'hbase', 'hbase', 'hbck', '-repair',
                    'blackbox_test_table'
                ])
                for line in hbase_fix_output.splitlines():
                    if 'Status:' in line or 'inconsistencies detected' in line:
                        LOGGER.debug(line)
                subprocess.check_output([
                    'sudo', '-u', 'hbase', 'hbase', 'zkcli', 'rmr',
                    '/hbase/table/blackbox_test_table'
                ])
                subprocess.check_output([
                    'sudo', '-u', 'hdfs', 'hadoop', 'fs', '-rm', '-r', '-f',
                    '-skipTrash', '/hbase/data/default/blackbox_test_table'
                ])
                read_hbase_ok = False
                reason = ['Failed to fetch row by row key from HBase']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.read_succeeded", reason, read_hbase_ok))

            #create some hive metadata
            reason = []
            if abort_test_sequence is True:
                return
            try:
                start = TIMESTAMP_MILLIS()
                hive = hive_api.connect(cdh.get_hive_endpoint())
                end = TIMESTAMP_MILLIS()
                # Drop any stale definition left over from a previous run.
                hive.cursor().execute("DROP TABLE blackbox_test_table")
                connect_to_hive_ms = end - start
                connect_to_hive_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.connection_time_ms", [],
                          connect_to_hive_ms))
            except:
                LOGGER.error(traceback.format_exc())
                connect_to_hive_ok = False
                reason = ['Failed to connect to Hive Metastore']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.connection_succeeded", reason,
                      connect_to_hive_ok))

            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                # Map a Hive external table onto the HBase test table so the
                # row written above can be read back through SQL.
                # NOTE(review): ``hive`` is unbound if the connect step
                # failed; the NameError is caught below.
                hive.cursor().execute((
                    "CREATE EXTERNAL TABLE "
                    "blackbox_test_table (key STRING, value STRING)"
                    "STORED BY \"org.apache.hadoop.hive.hbase.HBaseStorageHandler\" "
                    "WITH SERDEPROPERTIES "
                    "(\"hbase.columns.mapping\" = \":key,cf:column\") "
                    "TBLPROPERTIES(\"hbase.table.name\" = \"blackbox_test_table\")"
                ))
                end = TIMESTAMP_MILLIS()
                create_metadata_ms = end - start
                create_metadata_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.create_metadata_time_ms", [],
                          create_metadata_ms))
            except:
                LOGGER.error(traceback.format_exc())
                create_metadata_ok = False
                reason = [
                    'CREATE EXTERNAL TABLE statement failed on Hive Metastore'
                ]
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.create_metadata_succeeded", reason,
                      create_metadata_ok))

            #read some data via impala using it
            if abort_test_sequence is True:
                return

            if cdh.get_impala_endpoint() is not None:
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    impala = connect(host=cdh.get_impala_endpoint(),
                                     port=options.impalaport)
                    end = TIMESTAMP_MILLIS()
                    # Make Impala pick up the freshly created Hive metadata.
                    impala.cursor().execute("invalidate metadata")
                    connect_to_impala_ms = end - start
                    connect_to_impala_ok = True
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                              "hadoop.IMPALA.connection_time_ms", [],
                              connect_to_impala_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    connect_to_impala_ok = False
                    reason = ['Failed to connect to Impala']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.connection_succeeded", reason,
                          connect_to_impala_ok))

                if abort_test_sequence is True:
                    return
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    impala_cursor = impala.cursor()
                    impala_cursor.execute("SELECT * FROM blackbox_test_table")
                    table_contents = impala_cursor.fetchall()
                    end = TIMESTAMP_MILLIS()
                    read_impala_ms = end - start
                    read_impala_ok = table_contents[0][1] == 'value'
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                              "hadoop.IMPALA.read_time_ms", [],
                              read_impala_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    read_impala_ok = False
                    reason = ['Failed to SELECT from Impala']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.read_succeeded", reason,
                          read_impala_ok))
            else:
                # No Impala endpoint: read back through Hive instead.
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    hive_cursor = hive.cursor()
                    hive_cursor.execute("SELECT * FROM blackbox_test_table")
                    table_contents = hive_cursor.fetchall()
                    end = TIMESTAMP_MILLIS()
                    read_hive_ms = end - start
                    read_hive_ok = table_contents[0][1] == 'value'
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                              "hadoop.HQUERY.read_time_ms", [], read_hive_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    read_hive_ok = False
                    reason = ['Failed to SELECT from Hive']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                          "hadoop.HQUERY.read_succeeded", reason,
                          read_hive_ok))

            #delete metadata
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                hive.cursor().execute("DROP TABLE blackbox_test_table")
                end = TIMESTAMP_MILLIS()
                drop_metadata_ms = end - start
                drop_metadata_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.drop_table_time_ms", [],
                          drop_metadata_ms))
            except:
                LOGGER.error(traceback.format_exc())
                drop_metadata_ok = False
                reason = ['Failed to DROP table in Hive Metastore']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.drop_table_succeeded", reason,
                      drop_metadata_ok))

            #delete hbase table
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                # Disabled deleting table to work around apparent hbase bug (see VPP-17) but leaving
                # test step in so it can be easily re-enabled for testing.
                #hbase.disable_table('blackbox_test_table')
                #hbase.delete_table('blackbox_test_table')
                end = TIMESTAMP_MILLIS()
                drop_table_ms = end - start
                drop_table_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.drop_table_time_ms", [],
                          drop_table_ms))
            except:
                LOGGER.error(traceback.format_exc())
                drop_table_ok = False
                reason = ['Failed to drop table in HBase']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.drop_table_succeeded", reason,
                      drop_table_ok))

        def to_status(flag):
            '''
            Convert True to OK and False to ERROR.

            Any value other than a boolean (or 0/1, which compare equal to
            the booleans) is passed through unchanged.
            '''
            if flag not in [True, False]:
                return flag
            return 'OK' if flag is True else 'ERROR'

        def default_health_value(name, service, operation, failed_step):
            """Record a timeout-failure Event for *name* unless one exists.

            Returns True when a placeholder Event was appended to
            health_values, False when the metric already has a result.
            """
            if any(event.metric == name for event in health_values):
                return False
            if failed_step is None:
                message = 'Timed out waiting for %s to complete' % operation
            else:
                message = 'Did not attempt to %s due to timeout waiting for: %s' % (
                    operation, failed_step)
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name(service), name,
                      [message], False))
            return True

        test_thread = threading.Thread(target=run_test_sequence)
        # Daemon thread: a step hung on a dead service must not keep the
        # plugin process alive past the 60-second budget below.
        test_thread.daemon = True
        abort_test_sequence = False
        test_thread.start()
        test_thread.join(60.0)
        # Tell the worker to stop between steps if it is still running.
        abort_test_sequence = True
        # NOTE(review): run_test_sequence() rebinds ``hbase`` as its own
        # local, so this outer ``hbase`` is always None and close() never
        # runs — confirm whether the unclosed connection is intentional.
        if hbase is not None:
            hbase.close()

        # For every step that produced no result before the timeout, record
        # a placeholder failure; the first such step becomes ``failed_step``
        # so later placeholders can say what they were waiting on.
        failed_step = None
        if default_health_value("hadoop.HBASE.create_table_succeeded", "HBASE",
                                "create HBase table",
                                failed_step) and failed_step is None:
            failed_step = "create HBase table"
        if default_health_value("hadoop.HBASE.write_succeeded", "HBASE",
                                "write to HBase",
                                failed_step) and failed_step is None:
            failed_step = "write to HBase"
        if default_health_value("hadoop.HBASE.read_succeeded", "HBASE",
                                "read from HBase",
                                failed_step) and failed_step is None:
            failed_step = "read from HBase"
        if default_health_value("hadoop.HIVE.connection_succeeded", "HIVE",
                                "connect to Hive Metastore",
                                failed_step) and failed_step is None:
            failed_step = "connect to Hive Metastore"
        if default_health_value("hadoop.HIVE.create_metadata_succeeded",
                                "HIVE", "create Hive Metastore table",
                                failed_step) and failed_step is None:
            failed_step = "create Hive Metastore table"
        if cdh.get_impala_endpoint() is not None:
            if default_health_value("hadoop.IMPALA.connection_succeeded",
                                    "IMPALA", "connect to Impala",
                                    failed_step) and failed_step is None:
                failed_step = "connect to Impala"
            if default_health_value("hadoop.IMPALA.read_succeeded", "IMPALA",
                                    "SELECT from Impala",
                                    failed_step) and failed_step is None:
                failed_step = "SELECT from Impala"
        else:
            if default_health_value("hadoop.HQUERY.read_succeeded", "HQUERY",
                                    "SELECT from Hive",
                                    failed_step) and failed_step is None:
                failed_step = "SELECT from Hive"
        if default_health_value("hadoop.HIVE.drop_table_succeeded", "HIVE",
                                "DROP table in Hive Metastore",
                                failed_step) and failed_step is None:
            failed_step = "DROP table in Hive Metastore"
        if default_health_value("hadoop.HBASE.drop_table_succeeded", "HBASE",
                                "drop table in HBase",
                                failed_step) and failed_step is None:
            failed_step = "drop table in HBase"

        cdh_status_indicators = cdh.get_status_indicators()
        health_values.extend(cdh_status_indicators)
        overall = {}
        # Roll the per-step health Events up into one Event per source
        # service, downgrading the aggregate whenever any step is worse.
        for health_val in health_values:
            try:
                current = overall[health_val.source]
                current_val = to_status(current.value)
                current_causes = current.causes
            except KeyError:
                # First Event seen for this source: start from a clean OK.
                current_val = 'OK'
                current_causes = []

            update = to_status(health_val.value)

            # If current is ERROR, output is ERROR, regardless
            # If current is WARN, output is WARN if update is OK but ERROR if further WARN or ERROR
            # If update is OK, output is OK if OK, WARN if WARN and ERROR if ERROR

            out = 'ERROR'
            if current_val != "ERROR":
                if current_val == 'WARN':
                    if update == 'OK':
                        out = 'WARN'
                if current_val == 'OK':
                    out = update
            current_val = out
            current_causes.extend(health_val.causes)

            overall[health_val.source] = Event(
                health_val.timestamp, health_val.source,
                'hadoop.%s.health' % cdh.get_type(health_val.source),
                current_causes, current_val)

        values.extend(health_values)
        values.extend(overall.values())

        if display:
            self._do_display(values)

        return values
示例#7
0
        def run_test_sequence():
            # pylint: disable=too-many-return-statements
            '''
            Run the ordered blackbox steps: create/write/read an HBase test
            table, create Hive metadata over it, read it back through
            Impala (or Hive when no Impala endpoint exists), then drop the
            metadata and the table.

            Executes on a worker thread.  Timing Events go to the
            closed-over ``values`` list and pass/fail Events to
            ``health_values``; the closed-over ``abort_test_sequence`` flag
            is polled between steps so a timed-out run stops early.
            '''
            # NOTE(review): this creates a new local binding — an outer
            # ``hbase`` variable in the enclosing scope is never updated, so
            # the connection opened here is not closed by the caller;
            # confirm.
            hbase = happybase.Connection(host=cdh.get_hbase_endpoint())
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()

                try:
                    hbase.create_table('blackbox_test_table', {'cf': dict()})
                    logging.debug("test table created")
                except AlreadyExists:
                    # Leftover table from an earlier run — reuse it.
                    logging.debug("test table exists")

                table = hbase.table('blackbox_test_table')
                end = TIMESTAMP_MILLIS()
                create_table_ok = True
                create_table_ms = end - start
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.create_table_time_ms", [],
                          create_table_ms))
            except:
                # Deliberately broad: any failure marks this step red but
                # the sequence keeps going so later steps still report.
                LOGGER.error(traceback.format_exc())
                create_table_ok = False
                reason = ['Create HBase table operation failed']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.create_table_succeeded", reason,
                      create_table_ok))

            #write some data to it
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                # NOTE(review): ``table`` is unbound if the create step
                # threw before assigning it; the resulting NameError is
                # caught below and reported as a write failure.
                table.put('row_key', {'cf:column': 'value'})
                end = TIMESTAMP_MILLIS()
                write_hbase_ok = True
                write_hbase_ms = end - start
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.write_time_ms", [], write_hbase_ms))
            except:
                LOGGER.error(traceback.format_exc())
                write_hbase_ok = False
                reason = ['Failed to insert row in HBase table']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.write_succeeded", reason, write_hbase_ok))

            #read some data from it
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                row = table.row('row_key', columns=['cf:column'])
                end = TIMESTAMP_MILLIS()
                read_hbase_ms = end - start
                read_hbase_ok = row['cf:column'] == 'value'
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.read_time_ms", [], read_hbase_ms))
            except:
                LOGGER.error(traceback.format_exc())
                # A failed read is treated as a corrupt test table: run an
                # hbck repair, then remove the table's znode and its HDFS
                # data so the next run starts clean.
                hbase_fix_output = subprocess.check_output([
                    'sudo', '-u', 'hbase', 'hbase', 'hbck', '-repair',
                    'blackbox_test_table'
                ])
                for line in hbase_fix_output.splitlines():
                    if 'Status:' in line or 'inconsistencies detected' in line:
                        LOGGER.debug(line)
                subprocess.check_output([
                    'sudo', '-u', 'hbase', 'hbase', 'zkcli', 'rmr',
                    '/hbase/table/blackbox_test_table'
                ])
                subprocess.check_output([
                    'sudo', '-u', 'hdfs', 'hadoop', 'fs', '-rm', '-r', '-f',
                    '-skipTrash', '/hbase/data/default/blackbox_test_table'
                ])
                read_hbase_ok = False
                reason = ['Failed to fetch row by row key from HBase']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.read_succeeded", reason, read_hbase_ok))

            #create some hive metadata
            reason = []
            if abort_test_sequence is True:
                return
            try:
                start = TIMESTAMP_MILLIS()
                hive = hive_api.connect(cdh.get_hive_endpoint())
                end = TIMESTAMP_MILLIS()
                # Drop any stale definition left over from a previous run.
                hive.cursor().execute("DROP TABLE blackbox_test_table")
                connect_to_hive_ms = end - start
                connect_to_hive_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.connection_time_ms", [],
                          connect_to_hive_ms))
            except:
                LOGGER.error(traceback.format_exc())
                connect_to_hive_ok = False
                reason = ['Failed to connect to Hive Metastore']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.connection_succeeded", reason,
                      connect_to_hive_ok))

            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                # Map a Hive external table onto the HBase test table so the
                # row written above can be read back through SQL.
                # NOTE(review): ``hive`` is unbound if the connect step
                # failed; the NameError is caught below.
                hive.cursor().execute((
                    "CREATE EXTERNAL TABLE "
                    "blackbox_test_table (key STRING, value STRING)"
                    "STORED BY \"org.apache.hadoop.hive.hbase.HBaseStorageHandler\" "
                    "WITH SERDEPROPERTIES "
                    "(\"hbase.columns.mapping\" = \":key,cf:column\") "
                    "TBLPROPERTIES(\"hbase.table.name\" = \"blackbox_test_table\")"
                ))
                end = TIMESTAMP_MILLIS()
                create_metadata_ms = end - start
                create_metadata_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.create_metadata_time_ms", [],
                          create_metadata_ms))
            except:
                LOGGER.error(traceback.format_exc())
                create_metadata_ok = False
                reason = [
                    'CREATE EXTERNAL TABLE statement failed on Hive Metastore'
                ]
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.create_metadata_succeeded", reason,
                      create_metadata_ok))

            #read some data via impala using it
            if abort_test_sequence is True:
                return

            if cdh.get_impala_endpoint() is not None:
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    impala = connect(host=cdh.get_impala_endpoint(),
                                     port=options.impalaport)
                    end = TIMESTAMP_MILLIS()
                    # Make Impala pick up the freshly created Hive metadata.
                    impala.cursor().execute("invalidate metadata")
                    connect_to_impala_ms = end - start
                    connect_to_impala_ok = True
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                              "hadoop.IMPALA.connection_time_ms", [],
                              connect_to_impala_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    connect_to_impala_ok = False
                    reason = ['Failed to connect to Impala']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.connection_succeeded", reason,
                          connect_to_impala_ok))

                if abort_test_sequence is True:
                    return
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    impala_cursor = impala.cursor()
                    impala_cursor.execute("SELECT * FROM blackbox_test_table")
                    table_contents = impala_cursor.fetchall()
                    end = TIMESTAMP_MILLIS()
                    read_impala_ms = end - start
                    read_impala_ok = table_contents[0][1] == 'value'
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                              "hadoop.IMPALA.read_time_ms", [],
                              read_impala_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    read_impala_ok = False
                    reason = ['Failed to SELECT from Impala']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.read_succeeded", reason,
                          read_impala_ok))
            else:
                # No Impala endpoint: read back through Hive instead.
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    hive_cursor = hive.cursor()
                    hive_cursor.execute("SELECT * FROM blackbox_test_table")
                    table_contents = hive_cursor.fetchall()
                    end = TIMESTAMP_MILLIS()
                    read_hive_ms = end - start
                    read_hive_ok = table_contents[0][1] == 'value'
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                              "hadoop.HQUERY.read_time_ms", [], read_hive_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    read_hive_ok = False
                    reason = ['Failed to SELECT from Hive']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                          "hadoop.HQUERY.read_succeeded", reason,
                          read_hive_ok))

            #delete metadata
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                hive.cursor().execute("DROP TABLE blackbox_test_table")
                end = TIMESTAMP_MILLIS()
                drop_metadata_ms = end - start
                drop_metadata_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.drop_table_time_ms", [],
                          drop_metadata_ms))
            except:
                LOGGER.error(traceback.format_exc())
                drop_metadata_ok = False
                reason = ['Failed to DROP table in Hive Metastore']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.drop_table_succeeded", reason,
                      drop_metadata_ok))

            #delete hbase table
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                # Disabled deleting table to work around apparent hbase bug (see VPP-17) but leaving
                # test step in so it can be easily re-enabled for testing.
                #hbase.disable_table('blackbox_test_table')
                #hbase.delete_table('blackbox_test_table')
                end = TIMESTAMP_MILLIS()
                drop_table_ms = end - start
                drop_table_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.drop_table_time_ms", [],
                          drop_table_ms))
            except:
                LOGGER.error(traceback.format_exc())
                drop_table_ok = False
                reason = ['Failed to drop table in HBase']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.drop_table_succeeded", reason,
                      drop_table_ok))
示例#8
0
class TestCDHBlackboxPlugin(unittest.TestCase):
    '''
    Set of unit tests designed to validate cdh-blackbox Plugin
    '''
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.ApiResource')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.happybase.Connection')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jPype')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jdbApi')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.connect')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_name',
                lambda s, x: {'HBASE': 'hbase01', 'IMPALA': 'impala01', 'HIVE': 'hive01'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_type',
                lambda s, x: {'hbase01': 'HBASE', 'impala01': 'IMPALA', 'hive01': 'HIVE'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hbase_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hive_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_impala_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_status_indicators',
                lambda s: [])
    def test_pass_simple(self, impala_connect_mock, hive_mock, jpype_mock, happybase_connection_mock, api_mock):
        '''
        Happy path: all mocked backends (HBase, Impala, Hive) succeed, so the
        runner must emit every *_time_ms metric with a numeric value and every
        *_succeeded / health metric reporting success, in the exact order
        checked below.
        '''
        # mock HBase connection.table.row
        _table = mock.MagicMock()
        _table.row.return_value = {'cf:column': 'value'}
        _hbase_conn = mock.MagicMock()
        _hbase_conn.table.return_value = _table
        happybase_connection_mock.return_value = _hbase_conn

        # mock Impala connection.cursor.fetchall
        _cursor = mock.MagicMock()
        _cursor.fetchall.return_value = [[None, 'value']]
        _impala_conn = mock.MagicMock()
        _impala_conn.cursor.return_value = _cursor
        impala_connect_mock.return_value = _impala_conn

        # mock JayDeBe connection and JPype
        _jpype = mock.MagicMock()
        _jpype.getDefaultJVMPath.return_value = None
        _jpype.startJVM.return_value = None
        jpype_mock.return_value = _jpype
        _hive_cursor = mock.MagicMock()
        _hive_cursor.fetchall.return_value = [[None, 'value']]
        _hive_conn = mock.MagicMock()
        _hive_conn.cursor.return_value = _hive_cursor
        hive_mock.return_value = _hive_conn

        plugin = CDHBlackboxPlugin()

        # NOTE(review): '--cmhost' appears twice (the second value, 7777,
        # evidently wins, matching server_host='7777' in the assert below) --
        # presumably '--cmport 7777' was intended; confirm against read_args.
        values = plugin.runner(("--cmhost 10.60.18.144 --cmhost 7777 "
                                "--cmuser user --cmpassword password --hadoopdistro CDH"), True)

        # NOTE(review): '******' looks like scrubbed credentials from the
        # original fixture -- verify these literals against the real plugin.
        api_mock.assert_called_with(password='******', server_host='7777',
                                    server_port='7180', username='******', version=11)

        # Expected numeric (timing) events; the 4th tuple element is unused
        # below -- only source, metric and causes are asserted for these.
        num_res = [('hbase01', 'hadoop.HBASE.create_table_time_ms', [], 5),
                   ('hbase01', 'hadoop.HBASE.write_time_ms', [], 7),
                   ('hbase01', 'hadoop.HBASE.read_time_ms', [], 1),
                   ('hive01', 'hadoop.HIVE.connection_time_ms', [], 7),
                   ('hive01', 'hadoop.HIVE.create_metadata_time_ms', [], 2),
                   ('impala01', 'hadoop.IMPALA.connection_time_ms', [], 0),
                   ('impala01', 'hadoop.IMPALA.read_time_ms', [], 4),
                   ('hive01', 'hadoop.HIVE.drop_table_time_ms', [], 3),
                   ('hbase01', 'hadoop.HBASE.drop_table_time_ms', [], 7)]
        # Expected boolean/status events; these ARE checked for exact value.
        gen_res = [('hbase01', 'hadoop.HBASE.create_table_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.write_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.read_succeeded', [], True),
                   ('hive01', 'hadoop.HIVE.connection_succeeded', [], True),
                   ('hive01', 'hadoop.HIVE.create_metadata_succeeded', [], True),
                   ('impala01', 'hadoop.IMPALA.connection_succeeded', [], True),
                   ('impala01', 'hadoop.IMPALA.read_succeeded', [], True),
                   ('hive01', 'hadoop.HIVE.drop_table_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.drop_table_succeeded', [], True),
                   ('hbase01', 'hadoop.HBASE.health', [], 'OK'),
                   ('impala01', 'hadoop.IMPALA.health', [], 'OK'),
                   ('hive01', 'hadoop.HIVE.health', [], 'OK')
                  ]

        self.assertEqual(len(values), len(num_res) + len(gen_res))
        index = 0
        for check in num_res:
            self.assertEqual(check[0], values[index].source)
            self.assertEqual(check[1], values[index].metric)
            self.assertEqual(check[2], values[index].causes)
            # Timing values are only required to be integral (int or, on
            # Python 2, long) -- exact durations are not reproducible.
            self.assertTrue(isinstance(values[index].value, int) or
                            isinstance(values[index].value, long))
            index += 1

        for check in gen_res:
            self.assertEqual(check[0], values[index].source)
            self.assertEqual(check[1], values[index].metric)
            self.assertEqual(check[2], values[index].causes)
            self.assertEqual(check[3], values[index].value)
            index += 1

    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.ApiResource')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.happybase.Connection')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jPype')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.jdbApi')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.connect')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_name',
                lambda s, x: {'HBASE': 'hbase01', 'IMPALA': 'impala01',
                              'HIVE': 'hive01', 'HDFS': 'hdfs01'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_type',
                lambda s, x: {'hbase01': 'HBASE', 'impala01': 'IMPALA',
                              'hive01': 'HIVE', 'hdfs01': 'HDFS'}[x])
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hbase_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_hive_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_impala_endpoint',
                lambda s: '0.0.0.0')
    @mock.patch('plugins.cdh_blackbox.TestbotPlugin.CDHData.get_status_indicators',
                lambda s: [Event(0, 'hdfs01', 'hadoop.HDFS.cm_indicator', [], 'OK'),
                           Event(0, 'hbase01', 'hadoop.HBASE.cm_indicator', ['Cause A'], 'WARN'),
                           Event(0, 'hive01', 'hadoop.HIVE.cm_indicator', ['Cause B'], 'ERROR'),
                           Event(0, 'impala01', 'hadoop.IMPALA.cm_indicator',
                                 ['Cause C', 'Cause D'], 'WARN'),
                          ])
    def test_merge_simple(self, impala_connect_mock, hive_mock, jpype_mock, happybase_connection_mock, api_mock):
        '''
        Merge path: CM indicators (mocked above) must be merged with live
        blackbox results. Impala's fetchall is made to fail, so its WARN
        indicator is escalated to ERROR and the blackbox failure cause is
        prepended to the CM causes.
        '''
        # mock HBase connection.table.row with success
        _table = mock.MagicMock()
        _table.row.return_value = {'cf:column': 'value'}
        _hbase_conn = mock.MagicMock()
        _hbase_conn.table.return_value = _table
        happybase_connection_mock.return_value = _hbase_conn

        # mock Impala connection.cursor.fetchall with failure
        _cursor = mock.MagicMock()
        _cursor.fetchall.side_effect = Exception()
        _impala_conn = mock.MagicMock()
        _impala_conn.cursor.return_value = _cursor
        impala_connect_mock.return_value = _impala_conn

        # mock JayDeBe connection and JPype
        _jpype = mock.MagicMock()
        _jpype.getDefaultJVMPath.return_value = None
        _jpype.startJVM.return_value = None
        jpype_mock.return_value = _jpype
        _hive_cursor = mock.MagicMock()
        _hive_cursor.fetchall.return_value = [[None, 'value']]
        _hive_conn = mock.MagicMock()
        _hive_conn.cursor.return_value = _hive_cursor
        hive_mock.return_value = _hive_conn

        plugin = CDHBlackboxPlugin()

        # NOTE(review): same duplicated '--cmhost' flag as test_pass_simple.
        values = plugin.runner(("--cmhost 10.60.18.144 --cmhost 7777 "
                                "--cmuser user --cmpassword password --hadoopdistro CDH"), True)

        api_mock.assert_called_with(password='******', server_host='7777',
                                    server_port='7180', username='******', version=11)

        for value in values:
            if value.metric == 'hadoop.HDFS.health':
                self.assertEqual(value.value, 'OK')
            if value.metric == 'hadoop.HBASE.health':
                self.assertEqual(value.value, 'WARN')
                self.assertEqual(value.causes, ['Cause A'])
            if value.metric == 'hadoop.HIVE.health':
                self.assertEqual(value.value, 'ERROR')
                self.assertEqual(value.causes, ['Cause B'])
            if value.metric == 'hadoop.IMPALA.health':
                # WARN from CM escalated to ERROR by the blackbox failure;
                # blackbox cause comes first, CM causes follow.
                self.assertEqual(value.value, 'ERROR')
                self.assertEqual(value.causes, ['Failed to SELECT from Impala',
                                                'Cause C', 'Cause D'])

    def test_cdhdata(self):
        '''
        Test that CDHData behaves as expected given a particular response from CM
        '''
        # One failing check at each level: service, role and host.
        scheck = {'name': 'service check',
                  'explanation': 'service broken', 'summary': 'BAD'}
        rcheck = {'name': 'role check',
                  'explanation': 'role broken', 'summary': 'BAD'}
        hcheck = {'name': 'host check',
                  'explanation': 'host broken', 'summary': 'BAD'}

        # Roles whose types drive endpoint discovery; all live on host 42.
        role1 = mock.MagicMock(type='HBASETHRIFTSERVER', healthChecks=[rcheck],
                               hostRef=mock.MagicMock(hostId=42))
        role2 = mock.MagicMock(type='HIVESERVER2', healthChecks=[rcheck],
                               hostRef=mock.MagicMock(hostId=42))
        role3 = mock.MagicMock(type='IMPALAD', healthChecks=[rcheck],
                               hostRef=mock.MagicMock(hostId=42))

        # 'name' must be set after construction: MagicMock(name=...) would
        # configure the mock's own name, not the attribute.
        service1 = mock.MagicMock(type='HBASE', healthSummary='CONCERNING', healthChecks=[scheck])
        service1.name = 'hbase01'
        service1.get_all_roles.return_value = [role1]
        service2 = mock.MagicMock(type='HIVE', healthSummary='WARN', healthChecks=[scheck])
        service2.name = 'hive01'
        service2.get_all_roles.return_value = [role2]
        service3 = mock.MagicMock(type='IMPALA', healthSummary='BAD', healthChecks=[scheck])
        service3.name = 'impala01'
        service3.get_all_roles.return_value = [role3]

        host = mock.MagicMock(hostname='hostA', healthChecks=[hcheck])

        api_mock = mock.Mock()
        cluster_mock = mock.Mock()
        cluster_mock.get_all_services = mock.Mock(return_value=[service1, service2, service3])
        api_mock.get_host = mock.Mock(return_value=host)

        cdhdata = CDHData(api_mock, cluster_mock)

        self.assertEqual(cdhdata.get_hive_endpoint(), 'hostA')
        self.assertEqual(cdhdata.get_hbase_endpoint(), 'hostA')
        self.assertEqual(cdhdata.get_impala_endpoint(), 'hostA')
        indicators = cdhdata.get_status_indicators()
        self.assertEqual(len(indicators), 3)
        for indicator in indicators:
            self.assertTrue(indicator.source in ['hive01', 'hbase01', 'impala01'])
            # CONCERNING/WARN summaries map to WARN; BAD maps to ERROR.
            if indicator.source == 'hbase01':
                self.assertEqual(indicator.metric, 'hadoop.HBASE.cm_indicator')
                self.assertEqual(indicator.value, 'WARN')
            if indicator.source == 'hive01':
                self.assertEqual(indicator.metric, 'hadoop.HIVE.cm_indicator')
                self.assertEqual(indicator.value, 'WARN')
            if indicator.source == 'impala01':
                self.assertEqual(indicator.metric, 'hadoop.IMPALA.cm_indicator')
                self.assertEqual(indicator.value, 'ERROR')
    def analyse_results(self, zk_data, test_result):
        '''
        Analyse the partition summary and Prod2Cons
        Then set the the test result flag accordingly
        If the test flag is not green, put a reason explaining why
        Then return the result as a single kafka.health Event.

        zk_data     -- monitor summary tuple (zookeeper nodes, brokers,
                       partitions); may be None/falsy when collection failed
        test_result -- producer/consumer round-trip counters (sent, received,
                       notvalid); only inspected when self.prod2cons is set
        '''
        analyse_status = MonitorStatus["green"]
        analyse_causes = []
        analyse_metric = 'kafka.health'

        if zk_data and zk_data.list_zk_ko:
            # Fix: compute the quorum size only when zk_data is available.
            # Previously this was computed unconditionally before the guard
            # and raised AttributeError whenever zk_data was None/falsy.
            zk_majority = int(math.ceil(
                float(len(zk_data.list_zk.split(","))) / 2))
            if zk_data.num_zk_ok >= zk_majority:
                # Quorum intact despite failures -> degrade to amber only.
                LOGGER.warn(
                    "analyse_results : at least one zookeeper node failed")
                analyse_status = MonitorStatus["amber"]
                analyse_causes.append("zookeeper node(s) unreachable (%s)" %
                                      zk_data.list_zk_ko)
            else:
                # Quorum lost -> red.
                LOGGER.error(
                    "analyse_results : at least one zookeeper node failed")
                analyse_status = MonitorStatus["red"]
                analyse_causes.append("zookeeper node(s) unreachable (%s)" %
                                      zk_data.list_zk_ko)

        if zk_data and zk_data.list_brokers_ko:
            # Any unreachable broker is treated as a hard failure.
            LOGGER.error("analyse_results : at least one broker failed")
            analyse_status = MonitorStatus["red"]
            analyse_causes.append("broker(s) unreachable (%s)" %
                                  zk_data.list_brokers_ko)

        if zk_data and zk_data.num_part_ko > 0:
            LOGGER.error(
                "analyse_results : at least one topic / partition inconsistency"
            )
            # Never downgrade an already-red status.
            if analyse_status != MonitorStatus["red"]:
                analyse_status = MonitorStatus["amber"]
            analyse_causes.append(
                "topic / partition inconsistency in zookeeper")

        if self.prod2cons:
            # Round-trip is valid only if every sent message was received
            # and none failed validation.
            if test_result.sent == test_result.received \
             and test_result.notvalid == 0:
                LOGGER.debug(
                    "analyse_results - test for messages sent / received is valid"
                )
            else:
                LOGGER.error(
                    "analyse_results - test for messages sent / received failed"
                )
                analyse_status = MonitorStatus["red"]
                analyse_causes.append("producer / consumer failed " + \
                    "(sent %d, rcv_ok %d, rcv_ko %d)" %
                                      (test_result.sent,
                                       test_result.received,
                                       test_result.notvalid))

        # whitebox analysis: JMX-derived error codes, -1 means no error
        if self.whitebox_error_code != -1:

            if self.whitebox_error_code == 101:
                LOGGER.warn("UnderReplicatedPartitions should be 0")
                if analyse_status != MonitorStatus["red"]:
                    analyse_status = MonitorStatus["amber"]
                analyse_causes.append("UnderReplicatedPartitions should be 0")
            elif self.whitebox_error_code == 102:
                LOGGER.warn(
                    "ActiveControllerCount only one broker in the cluster should have 1"
                )
                if analyse_status != MonitorStatus["red"]:
                    analyse_status = MonitorStatus["amber"]
                analyse_causes.append(
                    "ActiveControllerCount only one broker in the cluster should have 1"
                )
            elif self.whitebox_error_code == 104:
                LOGGER.warn(
                    "analyse_results : Unclean leader election rate, should be 0"
                )
                if analyse_status != MonitorStatus["red"]:
                    analyse_status = MonitorStatus["amber"]
                analyse_causes.append(
                    "Unclean leader election rate, should be 0")

        return Event(
            TIMESTAMP_MILLIS(), 'kafka',
            analyse_metric, analyse_causes, analyse_status)
示例#10
0
    def process(self, zknodes, gbrokers, partitions):
        '''
        Returns a named tuple of type MonitorSummary (the value constructed
        below; an earlier docstring said PartitionsSummary).

        Builds a PartitionState entry for every partition whose leader broker
        is known, appends kafka.nodes.* and kafka.partitions.* events to
        self.results, and summarises node/partition health counters.
        '''
        LOGGER.debug("process started")
        topic_ok = 0
        topic_ko = 0
        process_results = []
        for obj in partitions:
            parts_object = obj.partitions["list"]
            if obj.partitions["valid"] is True:
                for parts in parts_object:
                    # Get the partition leader
                    for part, partinfo in parts.iteritems():
                        leader_read = partinfo['leader']
                        broker = get_broker_by_id(gbrokers, '%d' % leader_read)

                        if broker is not None:
                            process_results.append(
                                PartitionState(broker.host, broker.port,
                                               obj.id, part,
                                               obj.partitions["valid"]))
                    # NOTE(review): incremented once per 'parts' dict inside
                    # the loop over parts_object, not once per topic (unlike
                    # topic_ko below) -- confirm this is the intended
                    # granularity for the kafka.partitions.ok metric.
                    topic_ok += 1
            else:
                topic_ko += 1
                LOGGER.error("Topic not in a good state (%s)", obj.id)
                # Leader unknown: record the topic with placeholder fields.
                process_results.append(
                    PartitionState(None, None, obj.id, None,
                                   obj.partitions["valid"]))

        self.results.append(
            Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.nodes', [],
                  gbrokers.connect))

        self.results.append(
            Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.nodes.ok', [],
                  gbrokers.num_ok))

        self.results.append(
            Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.nodes.ko', [],
                  gbrokers.num_ko))

        self.results.append(
            Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.partitions.ok', [],
                  topic_ok))

        self.results.append(
            Event(TIMESTAMP_MILLIS(), 'kafka', 'kafka.partitions.ko', [],
                  topic_ko))

        LOGGER.debug("process finished")
        return MonitorSummary(num_partitions=len(process_results),
                              list_brokers=gbrokers.connect,
                              list_brokers_ko=gbrokers.error,
                              num_brokers_ok=gbrokers.num_ok,
                              num_brokers_ko=gbrokers.num_ko,
                              list_zk=zknodes.connect,
                              list_zk_ko=zknodes.error,
                              num_zk_ok=zknodes.num_ok,
                              num_zk_ko=zknodes.num_ko,
                              num_part_ok=topic_ok,
                              num_part_ko=topic_ko,
                              partitions=tuple(process_results))
示例#11
0
    def update(self):
        '''
        Retrieve endpoint metadata & overall health indicators from Ambari plus any reason codes

        Returns sequence of Event tuples with metrics taking the form of hadoop.%s.cm_indicator
        '''
        # Reset collected events and the service name/type lookup tables.
        self._values = []
        self._metadata = {'names': {}, 'types': {}}

        def get_health_state(alert_state):
            '''
            Convert alert state to health state
            '''
            # CRITICAL -> ERROR, WARNING -> WARN, anything else -> OK
            if alert_state == 'CRITICAL':
                return "ERROR"
            elif alert_state == 'WARNING':
                return "WARN"
            return "OK"

        # get cluster name (first cluster returned by the Ambari API)
        cluster_uri = requests.get(
            '%s/clusters' % self._ambari_api,
            auth=self._http_auth,
            headers=self._http_headers).json()['items'][0]['href']

        # get all alerts and aggregate a health summary from the alert list
        # (alerts in maintenance mode are excluded by the filter below)
        alerts = requests.get(
            '%s/alerts?fields=Alert/component_name,Alert/text,Alert/'
            'label,Alert/state&Alert/maintenance_state.in(OFF)' % cluster_uri,
            auth=self._http_auth,
            headers=self._http_headers).json()['items']
        # HQUERY is registered unconditionally, even if no alert mentions it.
        self._metadata['names']['HQUERY'] = 'HQUERY'
        self._metadata['types']['HQUERY'] = 'HQUERY'
        service_health_store = {}
        service_health_causes = {}
        for alert_item in alerts:
            alert_info = alert_item['Alert']
            service_name = alert_info['service_name']
            # Map Ambari service names onto the type vocabulary used by the
            # rest of the plugin (SPARK -> SPARK_ON_YARN, Ambari itself and
            # its metrics service both -> CLUSTER_MANAGER).
            if service_name == 'SPARK':
                service_type = 'SPARK_ON_YARN'
            elif service_name == 'AMBARI':
                service_type = 'CLUSTER_MANAGER'
            elif service_name == 'AMBARI_METRICS':
                service_name = 'AMBARI'
                service_type = 'CLUSTER_MANAGER'
            else:
                service_type = service_name

            self._metadata['names'][service_type] = service_name
            self._metadata['types'][service_name] = service_type

            # Fold this alert into the service's running health summary
            # (worst state wins, per self._update_health).
            current_health = service_health_store[
                service_name] if service_name in service_health_store else 'OK'
            new_health = get_health_state(alert_info['state'])
            updated_health = self._update_health(current_health, new_health)
            service_health_store[service_name] = updated_health
            if new_health in ['ERROR', 'WARN']:
                # Record a human-readable cause for every degraded alert.
                current_causes = service_health_causes[service_name] if service_name in service_health_causes else []
                current_causes.append(
                    '%s: %s - %s' % (alert_info['host_name'],
                                     alert_info['label'], alert_info['text']))
                service_health_causes[service_name] = current_causes

        # Write out an event for each service (causes de-duplicated via set)
        for service_name in service_health_store:
            self._values.append(
                Event(
                    TIMESTAMP_MILLIS(), service_name,
                    "hadoop.%s.cm_indicator" %
                    self._metadata['types'][service_name],
                    list(
                        set(service_health_causes[service_name]
                            if service_name in service_health_causes else [])),
                    service_health_store[service_name]))

        # Grab endpoints used by other tests
        query = 'fields=host_components'
        self._metadata['hbase_endpoint'] = requests.get(
            '%s/services/HBASE/components/HBASE_MASTER?%s' %
            (cluster_uri, query),
            auth=self._http_auth,
            headers=self._http_headers).json(
            )['host_components'][0]['HostRoles']['host_name']
        self._metadata['hive_endpoint'] = requests.get(
            '%s/services/HIVE/components/HIVE_SERVER?%s' %
            (cluster_uri, query),
            auth=self._http_auth,
            headers=self._http_headers).json(
            )['host_components'][0]['HostRoles']['host_name']
        # Ambari deployments have no Impala; callers must handle None.
        self._metadata['impala_endpoint'] = None
示例#12
0
    def runner(self, args, display=True):
        '''
        Main section.

        Times the Deployment Manager '/repository/packages' and '/packages'
        endpoints and returns a list of Event tuples covering latency,
        package counts, per-endpoint success flags and an overall
        deployment-manager.health indicator.
        '''
        plugin_args = args.split() \
        if args is not None and (len(args.strip()) > 0) \
        else ""

        options = self.read_args(plugin_args)

        values = []
        # Fix: causes is now a list that stays empty when healthy. The
        # previous code used cause = "" and wrapped it as [cause], so the
        # health event always carried [''] even when everything was OK,
        # unlike every other event emitted by this plugin.
        causes = []

        try:
            start = TIMESTAMP_MILLIS()
            with eventlet.Timeout(100):
                req = requests.get("%s/repository/packages" % (options.dmendpoint), timeout=20)
            end = TIMESTAMP_MILLIS()
            packages_available_ok = True
            packages_available_count = len(req.json())
            packages_available_ms = end-start
            values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager", \
                "deployment-manager.packages_available_time_ms", \
                [], packages_available_ms))
            values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager", \
                "deployment-manager.packages_available_count", \
                [], packages_available_count))
        except Exception:
            # Timing/count events are skipped on failure; only the
            # succeeded flag below reports the outcome.
            packages_available_ok = False
        values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager", \
            "deployment-manager.packages_available_succeeded", \
            [], packages_available_ok))

        try:
            start = TIMESTAMP_MILLIS()
            with eventlet.Timeout(100):
                req = requests.get("%s/packages" \
                    % (options.dmendpoint), timeout=20)
            end = TIMESTAMP_MILLIS()
            packages_deployed_ok = True
            packages_deployed_count = len(req.json())
            packages_deployed_ms = end-start
            values.append(Event(TIMESTAMP_MILLIS(), "deployment-manager", \
                "deployment-manager.packages_deployed_time_ms", \
                [], packages_deployed_ms))
            values.append(Event(TIMESTAMP_MILLIS(), 'deployment-manager', \
                "deployment-manager.packages_deployed_count", \
                [], packages_deployed_count))
        except Exception:
            packages_deployed_ok = False
        values.append(Event(TIMESTAMP_MILLIS(), 'deployment-manager', \
            "deployment-manager.packages_deployed_succeeded", \
            [], packages_deployed_ok))

        health = "OK"
        if not packages_available_ok or not packages_deployed_ok:
            health = "ERROR"
            causes.append("Deployment manager package APIs are not working")
        values.append(Event(TIMESTAMP_MILLIS(), 'deployment-manager',
                            'deployment-manager.health', causes, health))
        if display:
            self._do_display(values)
        return values
示例#13
0
    def update(self):
        '''
        Retrieve endpoint metadata & overall health indicators from CM plus any reason codes

        Returns sequence of Event tuples with metrics taking the form of hadoop.%s.cm_indicator
        '''
        # Reset collected events and the service name/type lookup tables.
        self._values = []
        self._metadata = {'names': {}, 'types': {}}

        def is_bad(summary):
            '''
            Designated 'bad' status results
            '''
            return summary in ["BAD", "CONCERNING", "ERROR", "WARN"]

        def get_causes(health_checks):
            '''
            Extract causes ("name:explanation", or just "name" when no
            explanation is present) from failing health check results
            '''
            return [
                "%s%s" %
                (chk['name'], ":" +
                 chk['explanation'] if 'explanation' in chk.keys() else '')
                for chk in health_checks if is_bad(chk['summary'])
            ]

        def update_health(current, updated):
            '''
            Given current health and and an update return new current health
            (worst state wins; ERROR is never downgraded)
            '''
            updated_health = current

            if current != 'ERROR' and (updated == 'CONCERNING'
                                       or updated == 'WARN'):
                updated_health = 'WARN'
            elif updated == 'BAD' or updated == 'ERROR':
                updated_health = 'ERROR'

            return updated_health

        # Main body of function - single pass over all services picking up endpoints,
        # health of each service and causes in the case of poor health

        for service in self._cluster.get_all_services():

            self._metadata['names'][service.type] = service.name
            self._metadata['types'][service.name] = service.type

            service_health = update_health('OK', service.healthSummary)
            causes = get_causes(service.healthChecks)

            for role in service.get_all_roles():

                # Fix: fetch the role's host once and reuse it. The original
                # called self._api.get_host() with the same hostId up to
                # three times per role (endpoint lookup, host fetch, and a
                # re-fetch for its health checks).
                host = self._api.get_host(role.hostRef.hostId)

                if role.type == "HBASETHRIFTSERVER":
                    self._metadata['hbase_endpoint'] = host.hostname
                if role.type == "HIVESERVER2":
                    self._metadata['hive_endpoint'] = host.hostname
                if role.type == "IMPALAD":
                    self._metadata['impala_endpoint'] = host.hostname

                causes.extend(get_causes(host.healthChecks))
                causes.extend(get_causes(role.healthChecks))

            self._values.append(
                Event(TIMESTAMP_MILLIS(),
                      service.name, "hadoop.%s.cm_indicator" % service.type,
                      list(set(causes)), service_health))
示例#14
0
    def runner(self, args, display=True):
        '''
        Main section.

        Probes the Deployment Manager '/repository/packages' and '/packages'
        endpoints, timing each call, then emits latency / success / count
        events per endpoint followed by an overall deployment-manager.health
        event carrying every collected failure cause.
        '''
        plugin_args = args.split() if args is not None and args.strip() else ""

        options = self.read_args(plugin_args)
        cause = []
        values = []

        # Per-endpoint probe state: [succeeded, item count, elapsed ms],
        # pre-seeded with the failure defaults.
        probes = {'/repository/packages': [False, -1, -1],
                  '/packages': [False, -1, -1]}

        for path in ('/repository/packages', '/packages'):
            state = probes[path]
            # noinspection PyBroadException
            try:
                begin = TIMESTAMP_MILLIS()
                with eventlet.Timeout(100):
                    response = requests.get(
                        "%s%s" % (options.dmendpoint, path), timeout=20)
                state[2] = TIMESTAMP_MILLIS() - begin

                status, msg = DMBlackBox.validate_api_response(response, path)
                if status == 'SUCCESS':
                    state[0] = True
                    state[1] = len(response.json())
                else:
                    cause.append(msg)

            except RequestException:
                cause.append(
                    'Unable to connect to the Deployment Manager (request path = {})'
                    .format(path))

            except Exception as err:
                cause.append('Platform Testing Client Error- ' + str(err))

        packages_available_ok, packages_available_count, packages_available_ms = \
            probes['/repository/packages']
        packages_deployed_ok, packages_deployed_count, packages_deployed_ms = \
            probes['/packages']

        values.append(
            Event(TIMESTAMP_MILLIS(), "deployment-manager",
                  "deployment-manager.packages_available_time_ms", [],
                  packages_available_ms))
        values.append(
            Event(TIMESTAMP_MILLIS(), "deployment-manager",
                  "deployment-manager.packages_available_succeeded", [],
                  packages_available_ok))
        values.append(
            Event(TIMESTAMP_MILLIS(), "deployment-manager",
                  "deployment-manager.packages_available_count", [],
                  packages_available_count))
        values.append(
            Event(TIMESTAMP_MILLIS(), "deployment-manager",
                  "deployment-manager.packages_deployed_time_ms", [],
                  packages_deployed_ms))
        values.append(
            Event(TIMESTAMP_MILLIS(), 'deployment-manager',
                  "deployment-manager.packages_deployed_succeeded", [],
                  packages_deployed_ok))
        values.append(
            Event(TIMESTAMP_MILLIS(), 'deployment-manager',
                  "deployment-manager.packages_deployed_count", [],
                  packages_deployed_count))

        # Overall health: ERROR if either endpoint probe failed.
        health = "ERROR" if not (packages_available_ok and
                                 packages_deployed_ok) else "OK"

        values.append(
            Event(TIMESTAMP_MILLIS(), 'deployment-manager',
                  'deployment-manager.health', cause, health))
        if display:
            self._do_display(values)
        return values
示例#15
0
    def runner(self, args, display=True):
        """
        Run the Flink History Server health checks.

        Probes the ``/config`` endpoint (server availability and the
        installed Flink version) and the ``/joboverview`` endpoint
        (completed-job count), emits one Event per metric, and finishes
        with an overall ``flink.health`` event ("OK" or "ERROR").

        Args:
            args: whitespace-separated plugin options, or None/blank.
            display: when True, pretty-print the collected events.

        Returns:
            list of Event objects collected during this run.
        """
        # Normalise args to the list form read_args() expects; an empty
        # list (not "") is the correct "no options" value.
        plugin_args = args.split() \
        if args is not None and args.strip() \
        else []

        options = self.read_args(plugin_args)
        cause = []
        values = []

        hs_available_success, hs_completed_jobs_success = False, False
        hs_available_ms, hs_completed_jobs_ms = -1, -1
        installed_flink_version, completed_job_count = '', -1

        def probe(path, extra_codes=None):
            """GET <fhendpoint><path>, validate, return (status, msg,
            response, elapsed_ms). Raises like the inlined code did on
            connection failure or timeout."""
            start = TIMESTAMP_MILLIS()
            # eventlet.Timeout guards against the request hanging beyond
            # the per-socket timeout (e.g. a slow streaming response).
            with eventlet.Timeout(100):
                response = requests.get("%s%s" % (options.fhendpoint, path),
                                        timeout=20)
            elapsed = TIMESTAMP_MILLIS() - start
            if extra_codes is None:
                status, msg = Flink.validate_api_response(response, path)
            else:
                status, msg = Flink.validate_api_response(
                    response, path, extra_codes)
            return status, msg, response, elapsed

        # ---- availability / version check -----------------------------
        # noinspection PyBroadException
        try:
            path = '/config'
            status, msg, req, hs_available_ms = probe(path)

            if status == 'SUCCESS':
                installed_flink_version = json.loads(req.text).get(
                    "flink-version", '')
                hs_available_success = True
            else:
                cause.append(msg)

        except RequestException:
            cause.append(
                'Unable to connect to the Flink History Server (request path = {})'
                .format(path))

        except Exception as except_obj:
            cause.append('Platform Testing Client Error- ' + str(except_obj))

        # ---- completed-jobs check -------------------------------------
        # noinspection PyBroadException
        try:
            path = '/joboverview'
            # 404 is tolerated: the Flink history server returns 404
            # until at least one flink job has been executed.
            status, msg, req, hs_completed_jobs_ms = probe(path, [404])

            if status == 'SUCCESS':
                if req.status_code == 200:
                    completed_job_count = len(
                        json.loads(req.text).get('finished'))
                elif req.status_code == 404:
                    completed_job_count = 0
                hs_completed_jobs_success = True
            else:
                cause.append(msg)

        except RequestException:
            cause.append(
                'Unable to connect to the Flink History Server (request path = {})'
                .format(path))

        except Exception as except_obj:
            cause.append('Platform Testing Client Error- ' + str(except_obj))

        # ---- emit one event per metric --------------------------------
        metrics = [
            ("flink.history_server_available_success", hs_available_success),
            ("flink.installed_flink_version", installed_flink_version),
            ("flink.history_server_available_ms", hs_available_ms),
            ("flink.history_server_completed_jobs_success",
             hs_completed_jobs_success),
            ("flink.history_server_completed_jobs_count",
             completed_job_count),
            ("flink.history_server_completed_jobs_ms", hs_completed_jobs_ms),
        ]
        for metric_name, metric_value in metrics:
            values.append(
                Event(TIMESTAMP_MILLIS(), "flink", metric_name, [],
                      metric_value))

        # Overall health is ERROR if either probe failed.
        health = "OK"
        if not hs_available_success or not hs_completed_jobs_success:
            health = "ERROR"

        values.append(
            Event(TIMESTAMP_MILLIS(), 'flink', 'flink.health', cause, health))

        if display:
            self._do_display(values)
        return values
示例#16
0
                                       .rstrip('\r\n')
                    if zkelect == "leader" or zkelect == "standalone":
                        zk_election = True
                    self.results.append(
                        Event(TIMESTAMP_MILLIS(), 'zookeeper',
                              'zookeeper.%d.mode' % (zid), [], zkelect))
                except ZkError, ex:
                    LOGGER.error('Failed to access Zookeeper: %s', str(ex))
                    break
                except ProcessorError, ex:
                    LOGGER.error('Failed to process: %s', str(ex))
                    break
            else:
                self.results.append(
                    Event(TIMESTAMP_MILLIS(), 'zookeeper',
                          'zookeeper.%d.mode' % (zid), [],
                          MonitorStatus["red"]))
            zid += 1
        if not zk_data:
            zk_data = ZkMonitorSummary(list_zk=self.zconnect,
                                       list_zk_ko=self.zconnect,
                                       num_zk_ok=0,
                                       num_zk_ko=len(zknodes))

        # ----------------------------------------
        # Lets'build the global result structure
        # ----------------------------------------
        results_summary = analyse_results(zk_data, zk_election)
        # ----------------------------------------
        # if output display is required
        # ----------------------------------------