예제 #1
0
    def test_driver_recovers_nework_isolation(self):
        start_and_prime_singledc()

        idle_heartbeat_timeout = 3
        idle_heartbeat_interval = 1

        listener = TrackDownListener()

        cluster = Cluster(['127.0.0.1'],
                          load_balancing_policy=RoundRobinPolicy(),
                          idle_heartbeat_timeout=idle_heartbeat_timeout,
                          idle_heartbeat_interval=idle_heartbeat_interval,
                          executor_threads=16)
        session = cluster.connect(wait_for_all_pools=True)

        cluster.register_listener(listener)

        prime_request(PrimeOptions(then=NO_THEN))
        prime_request(RejectConnections(RejectType.REJECT_STARTUP))

        time.sleep((idle_heartbeat_timeout + idle_heartbeat_interval) * 2)

        for host in cluster.metadata.all_hosts():
            self.assertIn(host, listener.hosts_marked_down)

        self.assertRaises(NoHostAvailable, session.execute,
                          "SELECT * from system.local")

        clear_queries()
        prime_request(AcceptConnections())

        time.sleep(idle_heartbeat_timeout + idle_heartbeat_interval + 2)

        self.assertIsNotNone(session.execute("SELECT * from system.local"))
예제 #2
0
    def test_heart_beat_timeout(self):
        """
        Test to ensure the hosts are marked as down after a OTO is received.
        Also to ensure this happens within the expected timeout
        @since 3.10
        @jira_ticket PYTHON-762
        @expected_result all the hosts have been marked as down at some point

        @test_category metadata
        """
        number_of_dcs = 3
        nodes_per_dc = 20

        query_to_prime = "INSERT INTO test3rf.test (k, v) VALUES (0, 1);"

        idle_heartbeat_timeout = 5
        idle_heartbeat_interval = 1

        start_and_prime_cluster_defaults(number_of_dcs, nodes_per_dc)

        listener = TrackDownListener()
        executor = ThreadTracker(max_workers=8)

        # We need to disable compression since it's not supported in simulacron
        cluster = Cluster(
            compression=False,
            idle_heartbeat_interval=idle_heartbeat_interval,
            idle_heartbeat_timeout=idle_heartbeat_timeout,
            executor_threads=8,
            execution_profiles={
                EXEC_PROFILE_DEFAULT:
                ExecutionProfile(load_balancing_policy=RoundRobinPolicy())
            })
        self.addCleanup(cluster.shutdown)

        cluster.scheduler.shutdown()
        cluster.executor = executor
        cluster.scheduler = _Scheduler(executor)

        session = cluster.connect(wait_for_all_pools=True)
        cluster.register_listener(listener)

        log = logging.getLogger()
        log.setLevel('CRITICAL')
        self.addCleanup(log.setLevel, "DEBUG")

        prime_query(query_to_prime, then=NO_THEN)

        futures = []
        for _ in range(number_of_dcs * nodes_per_dc):
            future = session.execute_async(query_to_prime)
            futures.append(future)

        for f in futures:
            f._event.wait()
            self.assertIsInstance(f._final_exception, OperationTimedOut)

        prime_request(PrimeOptions(then=NO_THEN))

        # We allow from some extra time for all the hosts to be to on_down
        # The callbacks should start happening after idle_heartbeat_timeout + idle_heartbeat_interval
        time.sleep((idle_heartbeat_timeout + idle_heartbeat_interval) * 2.5)

        for host in cluster.metadata.all_hosts():
            self.assertIn(host, listener.hosts_marked_down)

        # In this case HostConnection._replace shouldn't be called
        self.assertNotIn("_replace", executor.called_functions)
예제 #3
0
    def test_retry_after_defunct(self):
        """
        We test cluster._retry is called if an the connection is defunct
        in the middle of a query

        Finally we verify the driver recovers correctly in the event
        of a network partition

        @since 3.12
        @expected_result the driver is able to query even if a host is marked
        as down in the middle of the query, it will go to the next one if the timeout
        hasn't expired

        @test_category connection
        """
        number_of_dcs = 3
        nodes_per_dc = 2

        query_to_prime = "INSERT INTO test3rf.test (k, v) VALUES (0, 1);"

        idle_heartbeat_timeout = 1
        idle_heartbeat_interval = 5

        simulacron_cluster = start_and_prime_cluster_defaults(
            number_of_dcs, nodes_per_dc)

        dc_ids = sorted(simulacron_cluster.data_center_ids)
        last_host = dc_ids.pop()
        prime_query(query_to_prime,
                    cluster_name="{}/{}".format(
                        simulacron_cluster.cluster_name, last_host))

        roundrobin_lbp = OrderedRoundRobinPolicy()
        cluster = Cluster(
            compression=False,
            idle_heartbeat_interval=idle_heartbeat_interval,
            idle_heartbeat_timeout=idle_heartbeat_timeout,
            execution_profiles={
                EXEC_PROFILE_DEFAULT:
                ExecutionProfile(load_balancing_policy=roundrobin_lbp)
            })

        session = cluster.connect(wait_for_all_pools=True)
        self.addCleanup(cluster.shutdown)

        # This simulates we only have access to one DC
        for dc_id in dc_ids:
            datacenter_path = "{}/{}".format(simulacron_cluster.cluster_name,
                                             dc_id)
            prime_query(query_to_prime,
                        then=NO_THEN,
                        cluster_name=datacenter_path)
            prime_request(
                PrimeOptions(then=NO_THEN, cluster_name=datacenter_path))

        # Only the last datacenter will respond, therefore the first host won't
        # We want to make sure the returned hosts are 127.0.0.1,  127.0.0.2, ... 127.0.0.8
        roundrobin_lbp._position = 0

        # After 3 + 1 seconds the connection should be marked and down and another host retried
        response_future = session.execute_async(
            query_to_prime,
            timeout=4 * idle_heartbeat_interval + idle_heartbeat_timeout)
        response_future.result()
        self.assertGreater(len(response_future.attempted_hosts), 1)

        # No error should be raised here since the hosts have been marked
        # as down and there's still 1 DC available
        for _ in range(10):
            session.execute(query_to_prime)

        # Might take some time to close the previous connections and reconnect
        time.sleep(10)
        assert_quiescent_pool_state(self, cluster)
        clear_queries()

        time.sleep(10)
        assert_quiescent_pool_state(self, cluster)
예제 #4
0
    def test_heartbeat_defunct_deadlock(self):
        """
        Ensure that there is no deadlock when request is in-flight and heartbeat defuncts connection
        @since 3.16
        @jira_ticket PYTHON-1044
        @expected_result an OperationTimeout is raised and no deadlock occurs

        @test_category connection
        """
        start_and_prime_singledc()

        # This is all about timing. We will need the QUERY response future to time out and the heartbeat to defunct
        # at the same moment. The latter will schedule a QUERY retry to another node in case the pool is not
        # already shut down.  If and only if the response future timeout falls in between the retry scheduling and
        # its execution the deadlock occurs. The odds are low, so we need to help fate a bit:
        # 1) Make one heartbeat messages be sent to every node
        # 2) Our QUERY goes always to the same host
        # 3) This host needs to defunct first
        # 4) Open a small time window for the response future timeout, i.e. block executor threads for retry
        #    execution and last connection to defunct
        query_to_prime = "SELECT * from testkesypace.testtable"
        query_host = "127.0.0.2"
        heartbeat_interval = 1
        heartbeat_timeout = 1
        lag = 0.05
        never = 9999

        class PatchedRoundRobinPolicy(RoundRobinPolicy):
            # Send always to same host
            def make_query_plan(self, working_keyspace=None, query=None):
                if query and query.query_string == query_to_prime:
                    return filter(lambda h: h == query_host, self._live_hosts)
                else:
                    return super(PatchedRoundRobinPolicy,
                                 self).make_query_plan()

        class PatchedCluster(Cluster):
            # Make sure that QUERY connection will timeout first
            def get_connection_holders(self):
                holders = super(PatchedCluster, self).get_connection_holders()
                return sorted(
                    holders,
                    reverse=True,
                    key=lambda v: int(v._connection.host == query_host))

            # Block executor thread like closing a dead socket could do
            def connection_factory(self, *args, **kwargs):
                conn = super(PatchedCluster,
                             self).connection_factory(*args, **kwargs)
                conn.defunct = late(seconds=2 * lag)(conn.defunct)
                return conn

        cluster = PatchedCluster(
            protocol_version=PROTOCOL_VERSION,
            compression=False,
            idle_heartbeat_interval=heartbeat_interval,
            idle_heartbeat_timeout=heartbeat_timeout,
            load_balancing_policy=PatchedRoundRobinPolicy())
        session = cluster.connect()
        self.addCleanup(cluster.shutdown)

        prime_query(query_to_prime, then={"delay_in_ms": never})

        # Make heartbeat due
        time.sleep(heartbeat_interval)

        future = session.execute_async(query_to_prime,
                                       timeout=heartbeat_interval +
                                       heartbeat_timeout + 3 * lag)
        # Delay thread execution like kernel could do
        future._retry_task = late(seconds=4 * lag)(future._retry_task)

        prime_request(
            PrimeOptions(then={
                "result": "no_result",
                "delay_in_ms": never
            }))
        prime_request(RejectConnections("unbind"))

        self.assertRaisesRegexp(OperationTimedOut,
                                "Connection defunct by heartbeat",
                                future.result)