def test_publish_subscribe(self):
        """Publish two events and check a subscriber receives exactly those two, in order.

        A timer sets the stop event after one second, so the third ``next()``
        on the subscriber generator is expected to raise ``StopIteration``.
        Afterwards the device counter and the subscriber counter must both be 2.
        """
        published = (
            ClusterHealthValidatorEvent.NodeStatus(),
            ClusterHealthValidatorEvent.NodePeersNulls(),
        )

        # Put events to the publish queue.
        for event in published:
            self.events_device.publish_event(event)

        subscriber_stop = threading.Event()
        received_count = multiprocessing.Value(ctypes.c_uint32, 0)

        # Stop subscriber in 1 second.
        stop_timer = threading.Timer(interval=1, function=subscriber_stop.set)
        stop_timer.start()
        self.events_device.start_delay = 0.5
        self.events_device.start()

        try:
            events_generator = self.events_device.outbound_events(
                stop_event=subscriber_stop, events_counter=received_count)

            for expected in published:
                event_class, received = next(events_generator)
                self.assertEqual(event_class, "ClusterHealthValidatorEvent")
                self.assertEqual(received, expected)

            # No more events: the generator must finish once the stop event is set.
            with self.assertRaises(StopIteration):
                next(events_generator)
        finally:
            self.events_device.stop(timeout=1)

        self.assertEqual(self.events_device.events_counter, received_count.value)
        self.assertEqual(received_count.value, 2)
def check_node_status_in_gossip_and_nodetool_status(
        gossip_info, nodes_status, current_node) -> HealthEventsGenerator:
    """Cross-check node statuses from gossip against those from nodetool status.

    Yields an ERROR-severity health event for every mismatch found:

    * a node is present in ``gossip_info`` but absent from ``nodes_status``
      (skipped when its gossip status is in
      ``current_node.GOSSIP_STATUSES_FILTER_OUT``);
    * exactly one of "gossip status is NORMAL" / "nodetool status is UN" holds;
    * a node with nodetool status UN is absent from ``gossip_info``.

    When ``nodes_status`` is empty only a warning is logged and nothing is yielded.
    """
    if not nodes_status:
        LOGGER.warning(
            "Node status info is not available. Search for the warning above. "
            "Verify node status can't be performed")
        return

    for node, gossip_entry in gossip_info.items():
        # NOTE(review): `node` is passed as-is here, while the loop at the
        # bottom of this function passes `node.ip_address` - confirm which one
        # print_node_running_nemesis() expects.
        is_target = current_node.print_node_running_nemesis(node)

        if node in nodes_status:
            gossip_status = gossip_entry['status']
            nodetool_status = nodes_status[node]['status']
            # The two views agree only when "NORMAL in gossip" and
            # "UN in nodetool" are both true or both false.
            if (gossip_status == 'NORMAL') != (nodetool_status == 'UN'):
                LOGGER.debug("Gossip info: %s\nnodetool.status info: %s",
                             gossip_info, nodes_status)
                yield ClusterHealthValidatorEvent.NodeStatus(
                    severity=Severity.ERROR,
                    node=current_node.name,
                    error=f"Current node {current_node}. Wrong node status. "
                    f"Node {node}{is_target} status in nodetool.status is "
                    f"{nodetool_status}, but status in gossip {gossip_status}",
                )
            continue

        # The node is known to gossip only.
        if gossip_entry['status'] in current_node.GOSSIP_STATUSES_FILTER_OUT:
            continue
        LOGGER.debug("Gossip info: %s\nnodetool.status info: %s",
                     gossip_info, nodes_status)
        yield ClusterHealthValidatorEvent.NodeStatus(
            severity=Severity.ERROR,
            node=current_node.name,
            error=f"Current node {current_node}. The node {node}{is_target} "
            f"exists in the gossip but doesn't exist in the nodetool.status",
        )

    # Validate that all nodes in nodetool.status exist in gossip
    for node in set(nodes_status) - set(gossip_info):
        if nodes_status[node]['status'] != 'UN':
            continue
        is_target = current_node.print_node_running_nemesis(node.ip_address)
        LOGGER.debug("Gossip info: %s\nnodetool.status info: %s",
                     gossip_info, nodes_status)
        # NOTE(review): a NodeSchemaVersion event is emitted from a node-status
        # check - confirm this is intended rather than NodeStatus.
        yield ClusterHealthValidatorEvent.NodeSchemaVersion(
            severity=Severity.ERROR,
            node=current_node.name,
            error=f"Current node {current_node}. "
            f"Node {node}{is_target} exists in the nodetool.status but missed in gossip.",
        )
def check_nulls_in_peers(gossip_info, peers_details, current_node) -> HealthEventsGenerator:
    """Report peers whose entry in ``peers_details`` contains "null" values.

    This validation is added to recreate the issue: https://github.com/scylladb/scylla/issues/4652
    Found scenario described in the https://github.com/scylladb/scylla/issues/6397

    For every affected peer the full ``system.peers`` row is queried through
    cqlsh. An ERROR event is yielded when the peer is present in
    ``gossip_info`` with a status outside
    ``current_node.GOSSIP_STATUSES_FILTER_OUT``; otherwise the same message is
    only logged as a warning. Nothing is checked when ``gossip_info`` is empty.
    """
    if not gossip_info:
        LOGGER.warning("Gossip info is not available. Search for the warning above")
        return

    for ip, peer_row in peers_details.items():
        row_is_clean = all(column != "null" for column in peer_row.values())
        if row_is_clean:
            continue

        is_target = current_node.print_node_running_nemesis(ip)
        message = (
            f"Current node {current_node.ip_address}. Found nulls in system.peers for "
            f"node {ip}{is_target} with status {gossip_info.get(ip, {}).get('status', 'n/a')} : "
            f"{peers_details[ip]}"
        )

        # By Asias request: https://github.com/scylladb/scylla/issues/6397#issuecomment-666893877
        LOGGER.debug("Print all columns from system.peers for peer %s", ip)
        current_node.run_cqlsh(f"select * from system.peers where peer = '{ip}'", split=True, verbose=True)

        reportable = (
            ip in gossip_info
            and gossip_info[ip]['status'] not in current_node.GOSSIP_STATUSES_FILTER_OUT
        )
        if not reportable:
            # Issue https://github.com/scylladb/scylla/issues/6397 - Should the info about decommissioned node
            # be kept in the system.peers?
            LOGGER.warning(message)
            continue

        yield ClusterHealthValidatorEvent.NodePeersNulls(
            severity=Severity.ERROR,
            node=current_node.name,
            error=message,
        )
Пример #4
0
def check_nodes_status(nodes_status: dict,
                       current_node,
                       removed_nodes_list=None) -> HealthEventsGenerator:
    """Yield a NodeStatus event for every node whose status is not "UN".

    The event severity is ERROR when the node is listed in
    ``removed_nodes_list`` and CRITICAL otherwise. When ``nodes_status`` is
    empty only a warning is logged and nothing is yielded.
    """
    node_type = 'regular'
    if current_node.running_nemesis:
        node_type = 'target'

    if not nodes_status:
        LOGGER.warning(
            "Node status info is not available. Search for the warning above")
        return

    LOGGER.info("Status for %s node %s", node_type, current_node.name)

    removed_nodes = () if removed_nodes_list is None else removed_nodes_list

    for node_ip, node_properties in nodes_status.items():
        if node_properties['status'] == "UN":
            continue

        is_target = current_node.print_node_running_nemesis(node_ip)

        # FIXME: #2383 must be reverted once scylladb/scylla-enterprise#1419 will be fixed.
        LOGGER.debug("REMOVED NODES LIST = %s", removed_nodes)

        if node_ip in removed_nodes:
            severity = Severity.ERROR
        else:
            severity = Severity.CRITICAL

        yield ClusterHealthValidatorEvent.NodeStatus(
            severity=severity,
            node=current_node.name,
            error=f"Current node {current_node.ip_address}. "
            f"Node with {node_ip}{is_target} status is {node_properties['status']}",
        )
Пример #5
0
    def test_cluster_health_validator_event_msgfmt(self):
        """Check ``str()`` output and pickle round-trip of the one-time health events."""
        event_id = "712128d0-4837-4213-8a60-d6e2ec106c52"

        # Each case is (event factory, expected str() output). Factories are
        # invoked one at a time inside the loop, so every event is created
        # right before it is checked.
        cases = (
            (
                lambda: ClusterHealthValidatorEvent.NodeStatus(
                    severity=Severity.CRITICAL, node="n1", error="e1"),
                "(ClusterHealthValidatorEvent Severity.CRITICAL) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodeStatus node=n1 error=e1",
            ),
            (
                lambda: ClusterHealthValidatorEvent.NodePeersNulls(
                    severity=Severity.ERROR, node="n2", error="e2"),
                "(ClusterHealthValidatorEvent Severity.ERROR) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodePeersNulls node=n2 error=e2",
            ),
            (
                lambda: ClusterHealthValidatorEvent.NodeSchemaVersion(
                    severity=Severity.WARNING, node="n3", message="m3"),
                "(ClusterHealthValidatorEvent Severity.WARNING) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodeSchemaVersion node=n3 message=m3",
            ),
            (
                lambda: ClusterHealthValidatorEvent.NodesNemesis(
                    severity=Severity.WARNING, node="n4", message="m4"),
                "(ClusterHealthValidatorEvent Severity.WARNING) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodesNemesis node=n4 message=m4",
            ),
            (
                lambda: ClusterHealthValidatorEvent.Info(node="n4", message="m4"),
                "(ClusterHealthValidatorEvent Severity.NORMAL) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=Info node=n4 message=m4",
            ),
            (
                lambda: ClusterHealthValidatorEvent.Done(node="n4", message="m4"),
                "(ClusterHealthValidatorEvent Severity.NORMAL) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=Done node=n4 message=m4",
            ),
        )

        for make_event, expected_text in cases:
            event = make_event()
            event.event_id = event_id
            self.assertEqual(str(event), expected_text)
            self.assertEqual(event, pickle.loads(pickle.dumps(event)))
Пример #6
0
 def test():
     """Publish one CRITICAL NodeStatus event, then wait for about two seconds.

     Any exception raised along the way is deliberately swallowed.
     """
     try:
         event = ClusterHealthValidatorEvent.NodeStatus(
             node='node-1',
             message='Failed by some reason',
             error='Reason to fail',
             severity=Severity.CRITICAL,
         )
         event.publish()

         # Poll the wall clock in 0.1 s steps until two seconds have passed.
         deadline = time.time() + 2
         while True:
             if time.time() >= deadline:
                 break
             time.sleep(0.1)
     except Exception:  # pylint: disable=broad-except
         pass
    def test_grafana(self):
        """Start the Grafana events pipeline and check how events flow through it.

        Verifies that the annotator, aggregator and postman processes have the
        expected types, are alive and share the test registry; then publishes
        three batches of ten events with ``requests.post`` patched and checks
        the number of POST calls, the tags of the last call and the per-process
        event counters. All three processes are stopped at the end.
        """
        registry = self.events_processes_registry

        start_grafana_pipeline(_registry=registry)
        grafana_annotator = get_events_process(EVENTS_GRAFANA_ANNOTATOR_ID, _registry=registry)
        grafana_aggregator = get_events_process(EVENTS_GRAFANA_AGGREGATOR_ID, _registry=registry)
        grafana_postman = get_grafana_postman(_registry=registry)

        time.sleep(EVENTS_SUBSCRIBERS_START_DELAY)

        try:
            expected_types = (
                (grafana_annotator, GrafanaAnnotator),
                (grafana_aggregator, GrafanaEventAggregator),
                (grafana_postman, GrafanaEventPostman),
            )
            for process, expected_class in expected_types:
                self.assertIsInstance(process, expected_class)
                self.assertTrue(process.is_alive())
                self.assertEqual(process._registry, self.events_main_device._registry)
                self.assertEqual(process._registry, self.events_processes_registry)

            grafana_aggregator.time_window = 1

            set_grafana_url("http://localhost", _registry=self.events_processes_registry)

            with unittest.mock.patch("requests.post") as post_mock:
                for run_number in range(1, 4):
                    with self.wait_for_n_events(grafana_annotator, count=10, timeout=1):
                        for _ in range(10):
                            self.events_main_device.publish_event(
                                ClusterHealthValidatorEvent.NodeStatus(severity=Severity.NORMAL))
                    time.sleep(1)

                # Checked once, after the loop: run_number still holds the
                # last value (3), so 15 POST calls are expected in total.
                self.assertEqual(post_mock.call_count, run_number * 5)
                self.assertEqual(
                    post_mock.call_args.kwargs["json"]["tags"],
                    ["ClusterHealthValidatorEvent", "NORMAL", "events", "NodeStatus"],
                )

            self.assertEqual(self.events_main_device.events_counter, grafana_annotator.events_counter)
            self.assertEqual(grafana_annotator.events_counter, grafana_aggregator.events_counter)
            self.assertLessEqual(grafana_postman.events_counter, grafana_aggregator.events_counter)
        finally:
            for process in (grafana_annotator, grafana_aggregator, grafana_postman):
                process.stop(timeout=1)
Пример #8
0
def check_nodes_status(nodes_status: dict,
                       current_node,
                       removed_nodes_list=()) -> HealthEventsGenerator:
    """Yield a CRITICAL NodeStatus event for every node whose status is not "UN".

    ``removed_nodes_list`` is only logged for reference; it does not affect
    the emitted events. When ``nodes_status`` is empty only a warning is
    logged and nothing is yielded.
    """
    node_type = 'regular'
    if current_node.running_nemesis:
        node_type = 'target'

    if not nodes_status:
        LOGGER.warning(
            "Node status info is not available. Search for the warning above")
        return

    LOGGER.info("Status for %s node %s", node_type, current_node.name)

    for node_ip, node_properties in nodes_status.items():
        if node_properties['status'] == "UN":
            continue

        LOGGER.info(
            "All nodes that have been removed up until this point: %s",
            str(removed_nodes_list))
        is_target = current_node.print_node_running_nemesis(node_ip)
        yield ClusterHealthValidatorEvent.NodeStatus(
            severity=Severity.CRITICAL,
            node=current_node.name,
            error=f"Current node {current_node.ip_address}. "
            f"Node with {node_ip}{is_target} status is {node_properties['status']}",
        )
def check_schema_version(gossip_info, peers_details, nodes_status,
                         current_node) -> HealthEventsGenerator:
    """Compare schema versions reported by gossip with those in SYSTEM.PEERS.

    Yields NodeSchemaVersion events:

    * ERROR when a NORMAL gossip node (other than ``current_node``) is missing
      from ``peers_details`` or has a different schema version there;
    * ERROR when some nodes exist in ``peers_details`` but not in ``gossip_info``;
    * WARNING when the nodes in gossip (statuses in
      ``current_node.GOSSIP_STATUSES_FILTER_OUT`` excluded) do not all share
      one schema version;
    * WARNING when the same holds for the schema versions in ``peers_details``.

    Nothing is checked when the cluster has a single node or when
    ``peers_details`` / ``gossip_info`` is empty.
    """
    if nodes_status and len(nodes_status) == 1:
        LOGGER.debug('There is one node only in the cluster. No peers data. '
                     'Verify schema version can\'t be performed')
        return

    if not peers_details:
        LOGGER.warning(
            "SYSTEM.PEERS info is not availble. Search for the warning above. "
            "Verify schema version can\'t be performed")
        return

    if not gossip_info:
        LOGGER.warning(
            "Gossip info is not availble. Search for the warning above. "
            "Verify schema version can\'t be performed")
        return

    state_dump = f"Gossip info: {gossip_info}\nSYSTEM.PEERS info: {peers_details}"

    # Validate schema version
    for node, gossip_entry in gossip_info.items():
        # SYSTEM.PEERS lists only the peers of the current node, so the current
        # node itself is not expected there. The schema version also can't be
        # validated for a node that is not in NORMAL status.
        if current_node == node or gossip_entry['status'] != 'NORMAL':
            continue

        is_target = current_node.print_node_running_nemesis(node.ip_address)

        if node in peers_details:
            gossip_schema = gossip_entry['schema']
            peers_schema = peers_details[node]['schema_version']
            if gossip_schema != peers_schema:
                LOGGER.debug(state_dump)
                yield ClusterHealthValidatorEvent.NodeSchemaVersion(
                    severity=Severity.ERROR,
                    node=current_node.name,
                    error=f"Current node {current_node}. Wrong Schema version. "
                    f"Node {node}{is_target} schema version in SYSTEM.PEERS is "
                    f"{peers_schema}, "
                    f"but schema version in gossip {gossip_schema}",
                )
        else:
            LOGGER.debug(state_dump)
            yield ClusterHealthValidatorEvent.NodeSchemaVersion(
                severity=Severity.ERROR,
                node=current_node.name,
                error=f"Current node {current_node}. "
                f"Node {node}{is_target} exists in the gossip but missed in SYSTEM.PEERS.",
            )

    # Validate that all nodes in SYSTEM.PEERS exist in gossip
    peers_only = set(peers_details) - set(gossip_info)
    if peers_only:
        LOGGER.debug(state_dump)
        yield ClusterHealthValidatorEvent.NodeSchemaVersion(
            severity=Severity.ERROR,
            node=current_node.name,
            error=f"Current node {current_node}. Nodes {','.join(node.ip_address for node in peers_only)}"
            f" exists in the SYSTEM.PEERS but missed in gossip.",
        )

    # Validate that same schema on all nodes in the gossip
    gossip_schemas = {
        entry['schema'] for entry in gossip_info.values()
        if entry['status'] not in current_node.GOSSIP_STATUSES_FILTER_OUT
    }
    if len(gossip_schemas) > 1:
        LOGGER.debug(state_dump)
        gossip_info_str = '\n'.join(
            f"{node}: {entry['schema']}" for node, entry in gossip_info.items())
        yield ClusterHealthValidatorEvent.NodeSchemaVersion(
            severity=Severity.WARNING,
            node=current_node.name,
            message=f"Current node {current_node}. "
            f"Schema version is not same on all nodes in gossip info: {gossip_info_str}",
        )

    # Validate that same schema on all nodes in the SYSTEM.PEERS
    peers_schemas = {
        row['schema_version'] for node, row in peers_details.items()
        if node in gossip_info
        and gossip_info[node]['status'] not in current_node.GOSSIP_STATUSES_FILTER_OUT
    }
    if len(peers_schemas) > 1:
        LOGGER.debug(state_dump)
        peers_info_str = '\n'.join(
            f"{node}: {row['schema_version']}" for node, row in peers_details.items())
        yield ClusterHealthValidatorEvent.NodeSchemaVersion(
            severity=Severity.WARNING,
            node=current_node.name,
            message=f"Current node {current_node}. "
            f"Schema version is not same on all nodes in SYSTEM.PEERS info: {peers_info_str}",
        )
    def test_cluster_health_validator_event_msgfmt(self):
        """Check ``str()`` output of a begin/end health-check event and of the one-time events.

        The one-time events created between ``begin_event()`` and
        ``end_event()`` are additionally checked to survive a pickle round-trip.
        """
        chc_event = ClusterHealthValidatorEvent()
        chc_event.publish_event = False
        chc_event.event_id = "7208cfbb-a083-4b7a-b0db-1982d88f6da0"
        chc_event.begin_event()
        expected_begin = (
            "(ClusterHealthValidatorEvent Severity.NORMAL) period_type=begin "
            "event_id=7208cfbb-a083-4b7a-b0db-1982d88f6da0"
        )
        self.assertEqual(str(chc_event), expected_begin)

        # Each case is (event factory, expected str() output). Factories are
        # invoked one at a time inside the loop, so every event is created
        # right before it is checked.
        one_time_event_id = "712128d0-4837-4213-8a60-d6e2ec106c52"
        one_time_cases = (
            (
                lambda: ClusterHealthValidatorEvent.NodeStatus(
                    severity=Severity.CRITICAL, node="n1", error="e1"),
                "(ClusterHealthValidatorEvent Severity.CRITICAL) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodeStatus node=n1 error=e1",
            ),
            (
                lambda: ClusterHealthValidatorEvent.NodePeersNulls(
                    severity=Severity.ERROR, node="n2", error="e2"),
                "(ClusterHealthValidatorEvent Severity.ERROR) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodePeersNulls node=n2 error=e2",
            ),
            (
                lambda: ClusterHealthValidatorEvent.NodeSchemaVersion(
                    severity=Severity.WARNING, node="n3", message="m3"),
                "(ClusterHealthValidatorEvent Severity.WARNING) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodeSchemaVersion node=n3 message=m3",
            ),
            (
                lambda: ClusterHealthValidatorEvent.NodesNemesis(
                    severity=Severity.WARNING, node="n4", message="m4"),
                "(ClusterHealthValidatorEvent Severity.WARNING) period_type=one-time "
                "event_id=712128d0-4837-4213-8a60-d6e2ec106c52: type=NodesNemesis node=n4 message=m4",
            ),
        )
        for make_event, expected_text in one_time_cases:
            event = make_event()
            event.event_id = one_time_event_id
            self.assertEqual(str(event), expected_text)
            self.assertEqual(event, pickle.loads(pickle.dumps(event)))

        chc_event.message = "Cluster health check finished"
        chc_event.duration = 5
        chc_event.end_event()
        expected_end = (
            "(ClusterHealthValidatorEvent Severity.NORMAL) period_type=end "
            "event_id=7208cfbb-a083-4b7a-b0db-1982d88f6da0 duration=5s message=Cluster health check finished"
        )
        self.assertEqual(str(chc_event), expected_end)