def test_controller_node_isolation(self):
    """
    Isolate the controller node and expect the cluster to be available.

    Flow:
      1. Wait until a controller exists and the test partition has a
         leader (which must be the first replica), then check that the
         cluster is available.
      2. Wait for a stable configuration of the internal
         ``kafka_internal/id_allocator`` partition and remember its leader.
      3. Inject a FAILURE_ISOLATE failure on the controller node. If the
         isolated node was also the ``id_allocator`` leader, wait (asking
         only the other nodes) until a different node leads it.
      4. Run a series of ping-pong requests through ``self.ping_pong()``.
    """
    def controller_available():
        return self.redpanda.controller() is not None

    admin = Admin(self.redpanda)

    # wait for controller
    wait_until(controller_available,
               timeout_sec=ELECTION_TIMEOUT * 2,
               backoff_sec=1)

    initial_leader_id, replicas = self._wait_for_leader()
    assert initial_leader_id == replicas[0]

    self._expect_available()

    allocator_info = admin.wait_stable_configuration(
        "id_allocator",
        namespace="kafka_internal",
        replication=3,
        timeout_s=ELECTION_TIMEOUT * 2)

    # isolate controller
    with FailureInjector(self.redpanda) as fi:
        # Resolve the controller exactly once so that the id we compare
        # against and the node we isolate are guaranteed to be the same
        # node. idx() is given the node object itself, consistent with the
        # idx(n) calls in the host filter below (previously it was given
        # the hostname string, which idx() is not expected to match).
        controller = self.redpanda.controller()
        controller_id = self.redpanda.idx(controller)
        fi.inject_failure(
            FailureSpec(FailureSpec.FAILURE_ISOLATE, controller))

        if allocator_info.leader == controller_id:
            # The isolated node led id_allocator: query only the remaining
            # nodes and wait for leadership to settle on one of them.
            hosts = [
                n.account.hostname for n in self.redpanda.nodes
                if self.redpanda.idx(n) != controller_id
            ]
            admin.await_stable_leader(
                "id_allocator",
                namespace="kafka_internal",
                replication=3,
                timeout_s=ELECTION_TIMEOUT * 2,
                hosts=hosts,
                check=lambda node_id: node_id != controller_id)

    # NOTE(review): these checks run after the `with` block has exited;
    # confirm this matches the intended nesting of the original test.
    connection = self.ping_pong()
    # First request is given a generous timeout/retry budget; the
    # remaining ones use the defaults.
    connection.ping_pong(timeout_s=10, retries=10)
    for _ in range(127):
        connection.ping_pong()
def test_one_node_down(self):
    """
    Basic high-availability scenario.

    Stop the node that currently leads our partition, then verify that the
    cluster keeps serving requests and that leadership lands on the peer
    we expect to take over.
    """

    # Comparators handed to MetricCheck.expect(); `a` is the earlier
    # sample and `b` the later one.
    def grew(a, b):
        return b > a

    def unchanged(a, b):
        return b == a

    def later_value_is(expected):
        return lambda a, b: int(b) == expected

    # Locate the current partition leader
    admin = Admin(self.redpanda)

    initial_leader_id, replica_ids = self._wait_for_leader()
    assert initial_leader_id == replica_ids[0]

    self._expect_available()

    allocator_info = admin.wait_stable_configuration(
        "id_allocator",
        namespace="kafka_internal",
        replication=3,
        timeout_s=ELECTION_TIMEOUT * 2)

    leader_node = self.redpanda.get_node(initial_leader_id)
    self.logger.info(
        f"Initial leader {initial_leader_id} {leader_node.account.hostname}"
    )
    self.logger.info(f"id_allocator leader {allocator_info.leader}")

    # The priority mechanism is expected to pick the next replica in the
    # list as the successor.
    expect_new_leader_id = replica_ids[1]
    expect_new_leader_node = self.redpanda.get_node(expect_new_leader_id)

    # Whichever replica is neither the old nor the expected new leader is
    # the pure observer.
    non_observers = {expect_new_leader_id, initial_leader_id}
    observer_node_id = set(replica_ids).difference(non_observers).pop()
    observer_node = self.redpanda.get_node(observer_node_id)
    self.logger.info(
        f"Tracking stats on observer node {observer_node_id} {observer_node.account.hostname}"
    )
    self.logger.info(
        f"Tracking stats on expected new leader node {expect_new_leader_id} {expect_new_leader_node.account.hostname}"
    )

    # Create the metric checks before the leader is stopped.
    raft_metrics = re.compile("vectorized_raft_.*")
    observer_check = MetricCheck(self.logger, self.redpanda, observer_node,
                                 raft_metrics, {'topic': self.topic})
    successor_check = MetricCheck(self.logger, self.redpanda,
                                  expect_new_leader_node, raft_metrics,
                                  {'topic': self.topic})

    self.logger.info(
        f"Stopping node {initial_leader_id} ({leader_node.account.hostname})"
    )
    self.redpanda.stop_node(leader_node)

    if allocator_info.leader == initial_leader_id:
        # The stopped node also led id_allocator: ask only the other
        # nodes and wait until one of them is the stable leader.
        surviving_hosts = []
        for node in self.redpanda.nodes:
            if self.redpanda.idx(node) == initial_leader_id:
                continue
            surviving_hosts.append(node.account.hostname)

        def not_stopped_node(node_id):
            return node_id != initial_leader_id

        admin.await_stable_leader("id_allocator",
                                  namespace="kafka_internal",
                                  replication=3,
                                  timeout_s=ELECTION_TIMEOUT * 2,
                                  hosts=surviving_hosts,
                                  check=not_stopped_node)

    def is_expected_successor(leader_id):
        return leader_id == expect_new_leader_id

    new_leader, _ = self._wait_for_leader(is_expected_successor)
    self.logger.info(f"Leadership moved to {new_leader}")

    self._expect_available()

    # Only the direction of metric movement is verified. Exact values
    # (for instance the number of elections that took place) are not
    # asserted, because a noisy enough test environment can blow through
    # even fairly long timeouts such as the 1500ms election timeout.
    #
    # Tighter checks would be worthwhile -- they could catch bugs where
    # elections need more rounds than expected -- once the tests run in a
    # less contended environment.
    observer_check.expect([
        ("vectorized_raft_leadership_changes_total", grew),
        ("vectorized_raft_leader_for", later_value_is(0)),
        ("vectorized_raft_received_vote_requests_total", grew),
    ])

    successor_check.expect([
        ("vectorized_raft_leadership_changes_total", grew),
        ("vectorized_raft_leader_for", later_value_is(1)),
        ("vectorized_raft_received_vote_requests_total", unchanged),
    ])