def test_bootstrap_waits_for_streaming_to_finish(self):
    """
    Verify that a delayed bootstrap still ends up marked as COMPLETED.

    A single node is started and loaded through stress, then a second node
    is bootstrapped with a byteman script installed at startup (judging by
    its file name the script injects a 5s sleep; the script itself is not
    visible from here). Once the new node is up it must report the
    COMPLETED bootstrap state and have 'Bootstrap completed' in debug.log.
    """
    test_cluster = self.cluster
    test_cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

    logger.debug("Create a cluster")
    test_cluster.populate(1)
    seed_node = test_cluster.nodelist()[0]

    logger.debug("Start node 1")
    seed_node.start(wait_for_binary_proto=True)

    logger.debug("Insert 10k rows")
    write_args = [
        'write', 'n=10K', 'no-warmup',
        '-rate', 'threads=8',
        '-schema', 'replication(factor=2)',
    ]
    seed_node.stress(write_args)

    logger.debug("Bootstrap node 2 with delay")
    joining_node = new_node(test_cluster, byteman_port='4200')
    joining_node.update_startup_byteman_script('./byteman/bootstrap_5s_sleep.btm')
    joining_node.start(wait_for_binary_proto=True)

    # Both the recorded bootstrap state and the debug log must agree.
    assert_bootstrap_state(self, joining_node, 'COMPLETED')
    completion_lines = joining_node.grep_log('Bootstrap completed', filename='debug.log')
    assert completion_lines
def test_bootstrap_with_reset_bootstrap_state(self):
    """
    Bootstrap a node, interrupt it, then bootstrap again from scratch.

    The first bootstrap of node3 is disturbed by an InterruptBootstrap
    thread working on node1. node3 is then killed and restarted with
    -Dcassandra.reset_bootstrap_progress=true; the test checks that the
    reset is logged and that the second bootstrap reaches COMPLETED.
    """
    test_cluster = self.cluster
    throttle = {'stream_throughput_outbound_megabits_per_sec': 1}
    test_cluster.set_configuration_options(values=throttle)
    test_cluster.populate(2).start(wait_other_notice=True)

    source_node = test_cluster.nodes['node1']
    source_node.stress(['write', 'n=100K', '-schema', 'replication(factor=2)'])
    source_node.flush()

    # Background thread that takes node1 down mid-stream so the first
    # bootstrap attempt fails.
    interrupter = InterruptBootstrap(source_node)
    interrupter.start()

    # First bootstrap attempt of node3; a NodeError here is the expected
    # outcome and is deliberately ignored.
    joining_node = new_node(test_cluster)
    try:
        joining_node.start()
    except NodeError:
        pass  # node doesn't start as expected
    interrupter.join()
    source_node.start()

    # Second attempt: hard-kill node3 and restart it asking for a fresh
    # bootstrap. The mark keeps the log checks scoped to this attempt.
    joining_node.stop(signal_event=signal.SIGKILL)
    restart_mark = joining_node.mark_log()
    joining_node.start(jvm_args=["-Dcassandra.reset_bootstrap_progress=true"])

    # The reset must be logged, and the node must become queryable.
    joining_node.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=restart_mark)
    joining_node.wait_for_binary_interface(from_mark=restart_mark)

    # The second bootstrap must have succeeded.
    assert_bootstrap_state(self, joining_node, 'COMPLETED')
def test_multi_dc_replace_with_rf1(self):
    """
    Replace a node in a two-datacenter cluster where each DC holds one replica.

    Data is written through a stress user profile into a keyspace using
    NetworkTopologyStrategy with dc1=1 and dc2=1. A node is then stopped and
    replaced in dc2; the replacement must finish bootstrapping, must not
    complain about missing streaming sources, and must serve the same data.
    """
    self._setup(n=[1, 1])

    table_name = 'keyspace1.users'

    yaml_config = """
    # Create the keyspace and table
    keyspace: keyspace1
    keyspace_definition: |
      CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
    table: users
    table_definition:
      CREATE TABLE users (
        username text,
        first_name text,
        last_name text,
        email text,
        PRIMARY KEY(username)
      ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
    insert:
      partitions: fixed(1)
      batchtype: UNLOGGED
    queries:
      read:
        cql: select * from users where username = ?
        fields: samerow
    """

    # The profile only needs to exist on disk while stress is running.
    with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
        stress_config.write(yaml_config)
        stress_config.flush()
        stress_args = [
            'user', 'profile=' + stress_config.name, 'n=10k', 'no-warmup',
            'ops(insert=1)', '-rate', 'threads=5',
        ]
        self.query_node.stress(stress_args)
        # Give the cluster time to settle: stress is frequently followed by
        # heavy GC, which otherwise makes the next step time out.
        time.sleep(30)

    # Snapshot of the data to compare against after the replace.
    initial_data = self._fetch_initial_data(table=table_name, cl=ConsistencyLevel.TWO)

    self._stop_node_to_replace(table=table_name)
    self._do_replace(data_center='dc2')

    assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

    # The keyspace must have been streamed across datacenters.
    missing_source_lines = self.replacement_node.grep_log(
        "Unable to find sufficient sources for streaming range")
    assert not missing_source_lines

    self._verify_data(initial_data, table=table_name, cl=ConsistencyLevel.LOCAL_ONE)
def _bootstrap_test_with_replica_down(self, consistent_range_movement, rf=2):
    """
    Test to check consistent bootstrap will not succeed when there are insufficient replicas

    A two-node cluster is loaded with stress data, node2 is stopped, and a
    third node is bootstrapped between the two with
    -Dcassandra.consistent.rangemovement set to `consistent_range_movement`.

    :param consistent_range_movement: value passed to the JVM flag. When it
        is truthy the bootstrap is expected to fail and node3 must not be
        running; otherwise the bootstrap is expected to complete.
    :param rf: replication factor used for the stress keyspace.

    @jira_ticket CASSANDRA-11848
    """
    cluster = self.cluster
    cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

    cluster.populate(2)
    node1, node2 = cluster.nodelist()

    node3_token = None
    # Make token assignment deterministic
    if not self.dtest_config.use_vnodes:
        cluster.set_configuration_options(values={'num_tokens': 1})
        tokens = cluster.balanced_tokens(3)
        logger.debug("non-vnode tokens: %r" % (tokens,))
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        node2.set_configuration_options(values={'initial_token': tokens[2]})
        node3_token = tokens[1]  # Add node 3 between node1 and node2

    cluster.start()

    node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8',
                  '-schema', 'replication(factor={})'.format(rf)])

    # change system_auth keyspace to 2 (default is 1) to avoid
    # "Unable to find sufficient sources for streaming" warning
    if cluster.cassandra_version() >= '2.2.0':
        session = self.patient_cql_connection(node1)
        session.execute("""
            ALTER KEYSPACE system_auth
                WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
        """)

    # Stop node2, so node3 will not be able to perform consistent range movement
    node2.stop(wait_other_notice=True)

    successful_bootstrap_expected = not consistent_range_movement

    node3 = new_node(cluster, token=node3_token)
    node3.start(wait_for_binary_proto=successful_bootstrap_expected,
                wait_other_notice=successful_bootstrap_expected,
                jvm_args=["-Dcassandra.consistent.rangemovement={}".format(consistent_range_movement)])

    if successful_bootstrap_expected:
        # with rf=1 and cassandra.consistent.rangemovement=false, missing sources are ignored
        # (consistent_range_movement is already known to be falsy on this path)
        if rf == 1:
            node3.watch_log_for("Unable to find sufficient sources for streaming range")
        assert node3.is_running()
        assert_bootstrap_state(self, node3, 'COMPLETED')
        return

    # From here on consistent_range_movement is truthy, so only the strict
    # consistency failure messages can apply; the wording depends on version.
    if cluster.version() < '4.0':
        node3.watch_log_for("A node required to move the data consistently is down")
    else:
        node3.watch_log_for("Necessary replicas for strict consistency were removed by source filters")
    assert_not_running(node3)
def _bootstrap_test_with_replica_down(self, consistent_range_movement, rf=2):
    """
    Test to check consistent bootstrap will not succeed when there are insufficient replicas

    A two-node cluster is loaded with stress data, node2 is stopped, and a
    third node is bootstrapped between the two with
    -Dcassandra.consistent.rangemovement set to `consistent_range_movement`.

    :param consistent_range_movement: value passed to the JVM flag. When it
        is truthy the bootstrap is expected to fail and node3 must not be
        running; otherwise the bootstrap is expected to complete.
    :param rf: replication factor used for the stress keyspace.

    @jira_ticket CASSANDRA-11848
    """
    cluster = self.cluster

    cluster.populate(2)
    node1, node2 = cluster.nodelist()

    node3_token = None
    # Make token assignment deterministic
    if not self.dtest_config.use_vnodes:
        cluster.set_configuration_options(values={'num_tokens': 1})
        tokens = cluster.balanced_tokens(3)
        logger.debug("non-vnode tokens: %r" % (tokens,))
        node1.set_configuration_options(values={'initial_token': tokens[0]})
        node2.set_configuration_options(values={'initial_token': tokens[2]})
        node3_token = tokens[1]  # Add node 3 between node1 and node2

    cluster.start()

    node1.stress(['write', 'n=10K', 'no-warmup', '-rate', 'threads=8',
                  '-schema', 'replication(factor={})'.format(rf)])

    # change system_auth keyspace to 2 (default is 1) to avoid
    # "Unable to find sufficient sources for streaming" warning
    if cluster.cassandra_version() >= '2.2.0':
        session = self.patient_cql_connection(node1)
        session.execute("""
            ALTER KEYSPACE system_auth
                WITH replication = {'class':'SimpleStrategy', 'replication_factor':2};
        """)

    # Stop node2, so node3 will not be able to perform consistent range movement
    node2.stop(wait_other_notice=True)

    successful_bootstrap_expected = not consistent_range_movement

    node3 = new_node(cluster, token=node3_token)
    node3.start(wait_for_binary_proto=successful_bootstrap_expected,
                wait_other_notice=successful_bootstrap_expected,
                jvm_args=["-Dcassandra.consistent.rangemovement={}".format(consistent_range_movement)])

    if successful_bootstrap_expected:
        # with rf=1 and cassandra.consistent.rangemovement=false, missing sources are ignored
        # (consistent_range_movement is already known to be falsy on this path)
        if rf == 1:
            node3.watch_log_for("Unable to find sufficient sources for streaming range")
        assert node3.is_running()
        assert_bootstrap_state(self, node3, 'COMPLETED')
        return

    # From here on consistent_range_movement is truthy, so only the strict
    # consistency failure messages can apply; the wording depends on version.
    if cluster.version() < '4.0':
        node3.watch_log_for("A node required to move the data consistently is down")
    else:
        node3.watch_log_for("Necessary replicas for strict consistency were removed by source filters")
    assert_not_running(node3)
def _test_restart_failed_replace(self, mode):
    """
    Fail a node replacement mid-stream, then recover it in the requested way.

    A three-node cluster is loaded, one node is stopped for replacement and
    a byteman script is submitted to the query node so that streaming to
    the replacement fails. After confirming the replacement is stuck in the
    IN_PROGRESS bootstrap state, recovery is driven according to `mode`.

    :param mode: one of
        'reset_resume_state' - restart with -Dcassandra.reset_bootstrap_progress=true
        'resume'             - run `nodetool bootstrap resume`
        'wipe'               - stop the node, wait for the cluster to drop
                               it, clean it up and replace again
    :raises RuntimeError: if `mode` is none of the above. Note that this is
        only detected after the failed replacement has been set up.
    """
    self.ignore_log_patterns = list(self.ignore_log_patterns) + [r'Error while waiting on bootstrap to complete']
    self._setup(n=3, enable_byteman=True)
    self._insert_data(n="1k")

    initial_data = self._fetch_initial_data()

    self._stop_node_to_replace()

    debug("Submitting byteman script to make stream fail")
    self.query_node.byteman_submit(['./byteman/stream_failure.btm'])

    self._do_replace(jvm_option='replace_address_first_boot',
                     opts={'streaming_socket_timeout_in_ms': 1000})

    # Make sure bootstrap did not complete successfully
    assert_bootstrap_state(self, self.replacement_node, 'IN_PROGRESS')

    if mode == 'reset_resume_state':
        mark = self.replacement_node.mark_log()
        debug("Restarting replacement node with -Dcassandra.reset_bootstrap_progress=true")
        # restart replacement node with resetting bootstrap state
        self.replacement_node.stop()
        self.replacement_node.start(
            jvm_args=[
                "-Dcassandra.replace_address_first_boot={}".format(self.replaced_node.address()),
                "-Dcassandra.reset_bootstrap_progress=true"
            ],
            wait_for_binary_proto=True)
        # check if we reset bootstrap state
        self.replacement_node.watch_log_for("Resetting bootstrap progress to start fresh", from_mark=mark)
    elif mode == 'resume':
        debug("Resuming failed bootstrap")
        self.replacement_node.nodetool('bootstrap resume')
        # check if we skipped already retrieved ranges
        self.replacement_node.watch_log_for("already available. Skipping streaming.")
        self.replacement_node.watch_log_for("Resume complete")
    elif mode == 'wipe':
        self.replacement_node.stop()

        debug("Waiting other nodes to detect node stopped")
        self.query_node.watch_log_for(
            "FatClient /{} has been silent for 30000ms, removing from gossip".format(self.replacement_node.address()),
            timeout=60)
        self.query_node.watch_log_for(
            "Node /{} failed during replace.".format(self.replacement_node.address()),
            timeout=60, filename='debug.log')

        debug("Restarting node after wiping data")
        self._cleanup(self.replacement_node)
        self.replacement_node.start(
            jvm_args=["-Dcassandra.replace_address_first_boot={}".format(self.replaced_node.address())],
            wait_for_binary_proto=True)
    else:
        # The placeholder is named, so the value must be passed by keyword;
        # a positional argument makes str.format raise KeyError instead.
        raise RuntimeError('invalid mode value {mode}'.format(mode=mode))

    # check if bootstrap succeeded
    assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

    debug("Bootstrap finished successully, verifying data.")

    self._verify_data(initial_data)
def test_resumable_bootstrap(self):
    """
    Resume a bootstrap whose streaming was made to fail.

    A byteman script is submitted to node1 so streaming to the joining node
    fails. The joining node must then sit in the IN_PROGRESS state until
    `nodetool bootstrap resume` is run, after which it must reach COMPLETED
    and a stress read against it must not report FAILURE.
    """
    test_cluster = self.cluster
    test_cluster.populate(2)

    source_node = test_cluster.nodes['node1']
    # byteman has to be configured on the node before the cluster starts
    source_node.byteman_port = '8100'
    source_node.import_config_files()

    test_cluster.start(wait_other_notice=True)

    # Pick the version-specific script that breaks streaming to node3.
    if test_cluster.version() < '4.0':
        failure_script = './byteman/pre4.0/stream_failure.btm'
    else:
        failure_script = './byteman/4.0/stream_failure.btm'
    source_node.byteman_submit([failure_script])

    write_args = [
        'write', 'n=1K', 'no-warmup', 'cl=TWO',
        '-schema', 'replication(factor=2)',
        '-rate', 'threads=50',
    ]
    source_node.stress(write_args)
    test_cluster.flush()

    # Start bootstrapping node3 and wait until it accepts CQL clients.
    joining_node = new_node(test_cluster)
    joining_node.start(wait_other_notice=False, wait_for_binary_proto=True)
    joining_node.watch_log_for("Starting listening for CQL clients")
    resume_mark = joining_node.mark_log()

    # The failed stream must leave node3 in bootstrap mode.
    retry_till_success(assert_bootstrap_state,
                       tester=self,
                       node=joining_node,
                       expected_bootstrap_state='IN_PROGRESS',
                       timeout=120)

    # Resume the bootstrap and wait for it to finish.
    joining_node.nodetool('bootstrap resume')
    joining_node.watch_log_for("Resume complete", from_mark=resume_mark)
    assert_bootstrap_state(self, joining_node, 'COMPLETED')

    # cleanup to guarantee each node will only have sstables of its ranges
    test_cluster.cleanup()

    logger.debug("Check data is present")
    # Reading through node3 shows whether bootstrap transferred all the data.
    read_args = [
        'read', 'n=1k', 'no-warmup',
        '-schema', 'replication(factor=2)',
        '-rate', 'threads=8',
    ]
    stress_out, _stress_err, _ = joining_node.stress(read_args)

    assert stress_out is None or "FAILURE" not in stress_out
def bootstrap_on_write_survey_and_join(cluster, token):
    """
    Bootstrap a node in write survey mode, then make it join the ring.

    The node is started with -Dcassandra.write_survey=true and must log that
    it is not becoming an active ring member while its bootstrap state is
    IN_PROGRESS. `nodetool join` must then be logged as leaving survey mode.

    NOTE(review): `self` is not a parameter; presumably this function is
    nested inside a test method that supplies it -- confirm against the
    enclosing scope.

    :param cluster: cluster the new node is added to
    :param token: initial_token configured on the new node
    :return: the started node
    """
    survey_node = new_node(cluster)
    survey_node.set_configuration_options(values={'initial_token': token})
    survey_node.start(jvm_args=["-Dcassandra.write_survey=true"],
                      wait_for_binary_proto=True)

    # Survey mode: the node is up but has not joined the ring yet.
    survey_lines = survey_node.grep_log(
        'Startup complete, but write survey mode is active, not becoming an active ring member.')
    assert len(survey_lines)
    assert_bootstrap_state(self, survey_node, 'IN_PROGRESS')

    # Joining on operator request must be reported in the log.
    survey_node.nodetool("join")
    join_lines = survey_node.grep_log(
        'Leaving write survey mode and joining ring at operator request')
    assert len(join_lines)
    return survey_node
def test_multi_dc_replace_with_rf1(self):
    """
    Replace a node in a two-datacenter cluster where each DC holds one replica.

    Data is written through a stress user profile into a keyspace using
    NetworkTopologyStrategy with dc1=1 and dc2=1. A node is then stopped and
    replaced in dc2; the replacement must finish bootstrapping, must not
    complain about missing streaming sources, and must serve the same data.
    """
    self._setup(n=[1, 1])

    table_name = 'keyspace1.users'

    yaml_config = """
    # Create the keyspace and table
    keyspace: keyspace1
    keyspace_definition: |
      CREATE KEYSPACE keyspace1 WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1, 'dc2': 1};
    table: users
    table_definition:
      CREATE TABLE users (
        username text,
        first_name text,
        last_name text,
        email text,
        PRIMARY KEY(username)
      ) WITH compaction = {'class':'SizeTieredCompactionStrategy'};
    insert:
      partitions: fixed(1)
      batchtype: UNLOGGED
    queries:
      read:
        cql: select * from users where username = ?
        fields: samerow
    """

    # The profile only needs to exist on disk while stress is running.
    with tempfile.NamedTemporaryFile(mode='w+') as stress_config:
        stress_config.write(yaml_config)
        stress_config.flush()
        stress_args = [
            'user', 'profile=' + stress_config.name, 'n=10k', 'no-warmup',
            'ops(insert=1)', '-rate', 'threads=5',
        ]
        self.query_node.stress(stress_args)
        # Give the cluster time to settle: stress is frequently followed by
        # heavy GC, which otherwise makes the next step time out.
        time.sleep(30)

    # Snapshot of the data to compare against after the replace.
    initial_data = self._fetch_initial_data(table=table_name, cl=ConsistencyLevel.TWO)

    self._stop_node_to_replace(table=table_name)
    self._do_replace(data_center='dc2')

    assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

    # The keyspace must have been streamed across datacenters.
    missing_source_lines = self.replacement_node.grep_log(
        "Unable to find sufficient sources for streaming range")
    assert not missing_source_lines

    self._verify_data(initial_data, table=table_name, cl=ConsistencyLevel.LOCAL_ONE)
def test_resumable_bootstrap(self):
    """
    Resume a bootstrap whose streaming was made to fail.

    A byteman script is submitted to node1 so streaming to the joining node
    fails. Once the joining node logs the failure, `nodetool bootstrap
    resume` is run; the node must then open its binary interface, reach the
    COMPLETED state, and a stress read against it must not report FAILURE.
    """
    test_cluster = self.cluster
    test_cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
    test_cluster.populate(2)

    source_node = test_cluster.nodes['node1']
    # byteman has to be configured on the node before the cluster starts
    source_node.byteman_port = '8100'
    source_node.import_config_files()

    test_cluster.start()

    # Pick the version-specific script that breaks streaming to node3.
    if test_cluster.version() < '4.0':
        failure_script = self.byteman_submit_path_pre_4_0
    else:
        failure_script = self.byteman_submit_path_4_0
    source_node.byteman_submit([failure_script])

    write_args = [
        'write', 'n=1K', 'no-warmup', 'cl=TWO',
        '-schema', 'replication(factor=2)',
        '-rate', 'threads=50',
    ]
    source_node.stress(write_args)
    test_cluster.flush()

    # Start bootstrapping node3; streaming is expected to fail.
    joining_node = new_node(test_cluster)
    joining_node.start(wait_other_notice=False)
    joining_node.watch_log_for('Some data streaming failed')

    # Resume the bootstrap and wait for the node to accept clients.
    joining_node.nodetool('bootstrap resume')
    joining_node.wait_for_binary_interface()
    assert_bootstrap_state(self, joining_node, 'COMPLETED')

    # cleanup to guarantee each node will only have sstables of its ranges
    test_cluster.cleanup()

    logger.debug("Check data is present")
    # Reading through node3 shows whether bootstrap transferred all the data.
    read_args = [
        'read', 'n=1k', 'no-warmup',
        '-schema', 'replication(factor=2)',
        '-rate', 'threads=8',
    ]
    stress_out, _stress_err, _ = joining_node.stress(read_args)

    assert stress_out is None or "FAILURE" not in stress_out
def test_simple_bootstrap_small_keepalive_period(self):
    """
    @jira_ticket CASSANDRA-11841
    Bootstrap must complete even when receiving a single sstable takes longer
    than streaming_socket_timeout_in_ms or 2*streaming_keep_alive_period_in_secs.

    Streaming from node1 is slowed with a byteman script while the cluster
    runs with a 2s keep-alive period (and, before 4.0, a 1000ms socket
    timeout). Before 4.0 the keep-alive traffic is also checked in debug.log.
    """
    test_cluster = self.cluster

    config = {'streaming_keep_alive_period_in_secs': 2}
    if test_cluster.version() < '4.0':
        config['streaming_socket_timeout_in_ms'] = 1000
    test_cluster.set_configuration_options(values=config)

    # Single-node cluster acting as the streaming source.
    test_cluster.populate(1)
    source_node = test_cluster.nodelist()[0]

    logger.debug("Setting up byteman on {}".format(source_node.name))
    # byteman has to be configured on the node before the cluster starts
    source_node.byteman_port = '8100'
    source_node.import_config_files()

    test_cluster.start()

    # Write data with compaction disabled, then flush it to disk.
    write_args = [
        'write', 'n=1K', '-rate', 'threads=8', '-schema',
        'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)',
    ]
    source_node.stress(write_args)
    test_cluster.flush()

    logger.debug("Submitting byteman script to {} to".format(source_node.name))
    # Sleep longer than streaming_socket_timeout_in_ms to make sure the node will not be killed
    source_node.byteman_submit([mk_bman_path('stream_5s_sleep.btm')])

    # Bootstrap a new node under the very small streaming timeouts.
    joining_node = new_node(test_cluster)
    joining_node.start(wait_for_binary_proto=True)

    # The streaming socket timeout must not have broken the bootstrap.
    assert_bootstrap_state(self, joining_node, 'COMPLETED')

    if test_cluster.version() < '4.0':
        expected_messages = (
            'Scheduling keep-alive task with 2s period.',
            'Sending keep-alive',
            'Received keep-alive',
        )
        for member in test_cluster.nodelist():
            for message in expected_messages:
                assert member.grep_log(message, filename='debug.log')
def test_simple_bootstrap_nodata(self):
    """
    @jira_ticket CASSANDRA-11010
    A node bootstrapping from sources that hold no data must still reach
    the COMPLETED bootstrap state.
    """
    test_cluster = self.cluster

    # Two empty nodes act as the streaming sources.
    test_cluster.populate(2)
    test_cluster.start(wait_other_notice=True)

    # Bootstrap the third node and check the recorded state.
    joining_node = new_node(test_cluster)
    joining_node.start(wait_for_binary_proto=True, wait_other_notice=True)

    assert_bootstrap_state(self, joining_node, 'COMPLETED')
def test_simple_bootstrap_nodata(self):
    """
    @jira_ticket CASSANDRA-11010
    A node bootstrapping from sources that hold no data must still reach
    the COMPLETED bootstrap state.
    """
    test_cluster = self.cluster
    test_cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')

    # Two empty nodes act as the streaming sources.
    test_cluster.populate(2)
    test_cluster.start()

    # Bootstrap the third node and check the recorded state.
    joining_node = new_node(test_cluster)
    joining_node.start(wait_for_binary_proto=True)

    assert_bootstrap_state(self, joining_node, 'COMPLETED')
def resumable_bootstrap_test(self):
    """
    Resume a bootstrap whose streaming was made to fail.

    A byteman script is submitted to node1 so streaming to the joining node
    fails. The joining node must be in the IN_PROGRESS state until
    `nodetool bootstrap resume` is run, after which it must reach COMPLETED
    and a stress read against it must not report FAILURE.
    """
    test_cluster = self.cluster
    test_cluster.populate(2)

    source_node = test_cluster.nodes['node1']
    # byteman has to be configured on the node before the cluster starts
    source_node.byteman_port = '8100'
    source_node.import_config_files()

    test_cluster.start(wait_other_notice=True)

    # Pick the version-specific script that breaks streaming to node3.
    if test_cluster.version() < '4.0':
        failure_script = './byteman/pre4.0/stream_failure.btm'
    else:
        failure_script = './byteman/4.0/stream_failure.btm'
    source_node.byteman_submit([failure_script])

    write_args = [
        'write', 'n=1K', 'no-warmup', 'cl=TWO',
        '-schema', 'replication(factor=2)',
        '-rate', 'threads=50',
    ]
    source_node.stress(write_args)
    test_cluster.flush()

    # Start bootstrapping node3 and wait until it accepts CQL clients.
    joining_node = new_node(test_cluster)
    joining_node.start(wait_other_notice=False, wait_for_binary_proto=True)
    joining_node.watch_log_for("Starting listening for CQL clients")
    resume_mark = joining_node.mark_log()

    # The failed stream must leave node3 in bootstrap mode.
    assert_bootstrap_state(self, joining_node, 'IN_PROGRESS')

    # Resume the bootstrap and wait for it to finish.
    joining_node.nodetool('bootstrap resume')
    joining_node.watch_log_for("Resume complete", from_mark=resume_mark)
    assert_bootstrap_state(self, joining_node, 'COMPLETED')

    # cleanup to guarantee each node will only have sstables of its ranges
    test_cluster.cleanup()

    debug("Check data is present")
    # Reading through node3 shows whether bootstrap transferred all the data.
    read_args = [
        'read', 'n=1k', 'no-warmup',
        '-schema', 'replication(factor=2)',
        '-rate', 'threads=8',
    ]
    stress_out, _stress_err, _ = joining_node.stress(read_args)

    if stress_out is not None:
        self.assertNotIn("FAILURE", stress_out)
def simple_bootstrap_test_small_keepalive_period(self):
    """
    @jira_ticket CASSANDRA-11841
    Test that bootstrap completes if it takes longer than streaming_socket_timeout_in_ms or
    2*streaming_keep_alive_period_in_secs to receive a single sstable

    The cluster is throttled to 1 megabit/s outbound with a 1000ms socket
    timeout and a 1s keep-alive period. Two separate stress runs, each
    followed by a flush, are used to produce more than one sstable before
    the second node bootstraps. Keep-alive scheduling, sending and
    receiving are then checked in every node's debug.log.
    """
    cluster = self.cluster
    cluster.set_configuration_options(values={
        'stream_throughput_outbound_megabits_per_sec': 1,
        'streaming_socket_timeout_in_ms': 1000,
        'streaming_keep_alive_period_in_secs': 1
    })

    # Create a single node cluster
    cluster.populate(1)
    node1 = cluster.nodelist()[0]
    cluster.start(wait_other_notice=True)

    # Create more than one sstable larger than 1MB
    # (compaction is disabled so the flushed sstables are not merged)
    for _ in range(2):
        node1.stress(['write', 'n=50K', '-rate', 'threads=8', '-schema',
                      'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)'])
        cluster.flush()

    # Compare the NUMBER of sstables: comparing the collection returned by
    # get_sstables directly against an int raises TypeError on Python 3 and
    # is vacuously true on Python 2.
    self.assertGreater(len(node1.get_sstables("keyspace1", "standard1")), 1)

    # Bootstraping a new node with very small streaming_socket_timeout_in_ms
    node2 = new_node(cluster)
    node2.start(wait_for_binary_proto=True)

    # Shouldn't fail due to streaming socket timeout timeout
    assert_bootstrap_state(self, node2, 'COMPLETED')

    for node in cluster.nodelist():
        self.assertTrue(node.grep_log('Scheduling keep-alive task with 1s period.', filename='debug.log'))
        self.assertTrue(node.grep_log('Sending keep-alive', filename='debug.log'))
        self.assertTrue(node.grep_log('Received keep-alive', filename='debug.log'))
def test_simple_bootstrap_small_keepalive_period(self):
    """
    @jira_ticket CASSANDRA-11841
    Bootstrap must complete even when receiving a single sstable takes longer
    than streaming_socket_timeout_in_ms or 2*streaming_keep_alive_period_in_secs.

    Streaming from node1 is slowed with a byteman script while the cluster
    runs with a 2s keep-alive period (and, before 4.0, a 1000ms socket
    timeout). Keep-alive traffic is then checked in every node's debug.log.
    """
    test_cluster = self.cluster

    config = {'streaming_keep_alive_period_in_secs': 2}
    if test_cluster.version() < '4.0':
        config['streaming_socket_timeout_in_ms'] = 1000
    test_cluster.set_configuration_options(values=config)

    # Single-node cluster acting as the streaming source.
    test_cluster.populate(1)
    source_node = test_cluster.nodelist()[0]

    logger.debug("Setting up byteman on {}".format(source_node.name))
    # byteman has to be configured on the node before the cluster starts
    source_node.byteman_port = '8100'
    source_node.import_config_files()

    test_cluster.start(wait_other_notice=True)

    # Write data with compaction disabled, then flush it to disk.
    write_args = [
        'write', 'n=1K', '-rate', 'threads=8', '-schema',
        'compaction(strategy=SizeTieredCompactionStrategy, enabled=false)',
    ]
    source_node.stress(write_args)
    test_cluster.flush()

    logger.debug("Submitting byteman script to {} to".format(source_node.name))
    # Sleep longer than streaming_socket_timeout_in_ms to make sure the node will not be killed
    source_node.byteman_submit(['./byteman/stream_5s_sleep.btm'])

    # Bootstrap a new node under the very small streaming timeouts.
    joining_node = new_node(test_cluster)
    joining_node.start(wait_for_binary_proto=True)

    # The streaming socket timeout must not have broken the bootstrap.
    assert_bootstrap_state(self, joining_node, 'COMPLETED')

    expected_messages = (
        'Scheduling keep-alive task with 2s period.',
        'Sending keep-alive',
        'Received keep-alive',
    )
    for member in test_cluster.nodelist():
        for message in expected_messages:
            assert member.grep_log(message, filename='debug.log')
def _base_bootstrap_test(self, bootstrap=None, bootstrap_from_version=None):
    """
    Bootstrap a second node into a one-node cluster and check data balance.

    10000 rows are written to ks.cf on node1, a background reader is started
    through self.go, and a second node is bootstrapped. After cleanup and
    compaction both nodes must hold roughly the same amount of data, and
    the amount node1 gained over its empty size must be about twice what it
    keeps afterwards.

    :param bootstrap: callable (cluster, token) -> node performing the
        bootstrap; defaults to a plain bootstrap that waits for the binary
        protocol.
    :param bootstrap_from_version: when set, node1 is switched to this
        version before start, and node2's debug.log must contain
        'does not support keep-alive'.
    """
    def default_bootstrap(cluster, token):
        # Plain bootstrap: new node with the given initial token.
        joiner = new_node(cluster)
        joiner.set_configuration_options(values={'initial_token': token})
        joiner.start(wait_for_binary_proto=True)
        return joiner

    do_bootstrap = default_bootstrap if bootstrap is None else bootstrap

    test_cluster = self.cluster
    tokens = test_cluster.balanced_tokens(2)
    test_cluster.set_configuration_options(values={'num_tokens': 1})

    debug("[node1, node2] tokens: %r" % (tokens,))

    key_count = 10000

    # Single-node cluster; the source node optionally runs another version.
    test_cluster.populate(1)
    source_node = test_cluster.nodelist()[0]
    if bootstrap_from_version:
        debug("starting source node on version {}".format(bootstrap_from_version))
        source_node.set_install_dir(version=bootstrap_from_version)
    source_node.set_configuration_options(values={'initial_token': tokens[0]})
    test_cluster.start(wait_other_notice=True)

    session = self.patient_cql_connection(source_node)
    self.create_ks(session, 'ks', 1)
    self.create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

    # Baseline size, taken before any of our own rows are inserted.
    empty_size = source_node.data_size()
    debug("node1 empty size : %s" % float(empty_size))

    insert_statement = session.prepare("INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
    insert_params = [['k%d' % k] for k in range(key_count)]
    execute_concurrent_with_args(session, insert_statement, insert_params)

    source_node.flush()
    source_node.compact()
    initial_size = source_node.data_size()
    debug("node1 size before bootstrapping node2: %s" % float(initial_size))

    def read_random_key(_):
        # One read of a random existing key at consistency level ONE.
        return query_c1c2(session, random.randint(0, key_count - 1), ConsistencyLevel.ONE)

    # Keep reading the inserted data while the bootstrap runs; the reads
    # are not expected to produce any error.
    reader = self.go(read_random_key)

    # Bootstrap the second node on the current version.
    joined_node = do_bootstrap(test_cluster, tokens[1])
    joined_node.compact()

    reader.check()
    source_node.cleanup()
    debug("node1 size after cleanup: %s" % float(source_node.data_size()))
    source_node.compact()
    debug("node1 size after compacting: %s" % float(source_node.data_size()))
    time.sleep(.5)
    reader.check()

    debug("node2 size after compacting: %s" % float(joined_node.data_size()))

    source_size = float(source_node.data_size())
    joined_size = float(joined_node.data_size())
    assert_almost_equal(source_size, joined_size, error=0.3)
    assert_almost_equal(float(initial_size - empty_size), 2 * (source_size - float(empty_size)))

    assert_bootstrap_state(self, joined_node, 'COMPLETED')
    if bootstrap_from_version:
        keep_alive_lines = joined_node.grep_log('does not support keep-alive', filename='debug.log')
        self.assertTrue(keep_alive_lines)
def _test_restart_failed_replace(self, mode):
    """
    Fail a node replacement mid-stream, then recover it in the requested way.

    A three-node cluster is loaded, one node is stopped for replacement and
    a version-specific byteman script is submitted to the query node so
    that streaming to the replacement fails. Once both sides have logged
    the failure, recovery is driven according to `mode`:

        'reset_resume_state' - restart with -Dcassandra.reset_bootstrap_progress=true
        'resume'             - run `nodetool bootstrap resume`
        'wipe'               - stop the node, wait for the cluster to drop
                               it, clean it up and replace again

    Any other value raises RuntimeError. Finally the replacement must be in
    the COMPLETED bootstrap state and hold the initial data.
    """
    setup = self.fixture_dtest_setup
    setup.ignore_log_patterns = list(setup.ignore_log_patterns) + [
        r'Error while waiting on bootstrap to complete'
    ]
    self._setup(n=3, enable_byteman=True)
    self._insert_data(n="1k")

    initial_data = self._fetch_initial_data()

    self._stop_node_to_replace()

    logger.debug("Submitting byteman script to make stream fail")
    btm_mark = self.query_node.mark_log()

    # Script and replace options both depend on the cluster version; the
    # socket timeout option is only passed before 4.0.
    replace_kwargs = {
        'jvm_option': 'replace_address_first_boot',
        'wait_for_binary_proto': False,
        'wait_other_notice': True,
    }
    if self.cluster.version() < '4.0':
        script = 'pre4.0/stream_failure.btm'
        replace_kwargs['opts'] = {'streaming_socket_timeout_in_ms': 1000}
    else:
        script = '4.0/stream_failure.btm'
    self.query_node.byteman_submit([mk_bman_path(script)])
    self._do_replace(**replace_kwargs)

    # Both ends must report the stream failure, and the replacement must
    # still be mid-bootstrap.
    self.query_node.watch_log_for("Triggering network failure", from_mark=btm_mark)
    self.query_node.watch_log_for("Stream failed", from_mark=btm_mark)
    self.replacement_node.watch_log_for("Stream failed")
    self.replacement_node.watch_log_for("Some data streaming failed.*IN_PROGRESS$")

    replace_flag = "-Dcassandra.replace_address_first_boot={}"

    if mode == 'reset_resume_state':
        restart_mark = self.replacement_node.mark_log()
        logger.debug("Restarting replacement node with -Dcassandra.reset_bootstrap_progress=true")
        # Restart with a fresh bootstrap state, allowing 180s for startup.
        self.replacement_node.stop()
        reset_args = [
            replace_flag.format(self.replaced_node.address()),
            "-Dcassandra.reset_bootstrap_progress=true",
        ]
        self.replacement_node.start(jvm_args=reset_args, wait_for_binary_proto=180)
        # The reset must be visible in the log written since the restart.
        self.replacement_node.watch_log_for("Resetting bootstrap progress to start fresh",
                                            from_mark=restart_mark)
    elif mode == 'resume':
        logger.debug("Resuming failed bootstrap")
        self.replacement_node.nodetool('bootstrap resume')
        # Ranges that were already retrieved must be skipped.
        self.replacement_node.watch_log_for("already available. Skipping streaming.")
        self.replacement_node.watch_log_for("Resume complete")
    elif mode == 'wipe':
        self.replacement_node.stop()

        logger.debug("Waiting other nodes to detect node stopped")
        node_log_str = self.replacement_node.address_for_current_version_slashy()
        silent_message = "FatClient {} has been silent for 30000ms, removing from gossip".format(node_log_str)
        failed_message = "Node {} failed during replace.".format(node_log_str)
        self.query_node.watch_log_for(silent_message, timeout=120)
        self.query_node.watch_log_for(failed_message, timeout=120, filename='debug.log')

        logger.debug("Restarting node after wiping data")
        self._cleanup(self.replacement_node)
        wipe_args = [replace_flag.format(self.replaced_node.address())]
        self.replacement_node.start(jvm_args=wipe_args, wait_for_binary_proto=120)
    else:
        raise RuntimeError('invalid mode value {mode}'.format(mode=mode))

    # Whatever the recovery path, the bootstrap must now be complete.
    assert_bootstrap_state(self, self.replacement_node, 'COMPLETED')

    logger.debug("Bootstrap finished successfully, verifying data.")

    self._verify_data(initial_data)
def _base_bootstrap_test(self, bootstrap=None, bootstrap_from_version=None, enable_ssl=None):
    """
    Bootstrap a second node into a one-node cluster and check data balance.

    10000 rows are written to ks.cf on node1, one random row is read back,
    and a second node is bootstrapped. After cleanup and compaction both
    nodes must hold roughly the same amount of ks.cf data, and the amount
    node1 gained over its empty size must be about twice what it keeps.

    :param bootstrap: callable (cluster, token) -> node performing the
        bootstrap; defaults to a plain bootstrap that waits for the binary
        protocol.
    :param bootstrap_from_version: when set, node1 is switched to this
        version before the cluster starts.
    :param enable_ssl: when truthy, ssl stores are generated under the
        test path and internode ssl is enabled on the cluster.
    """
    def default_bootstrap(cluster, token):
        # Plain bootstrap: new node with the given initial token.
        joiner = new_node(cluster)
        joiner.set_configuration_options(values={'initial_token': token})
        joiner.start(wait_for_binary_proto=True)
        return joiner

    do_bootstrap = default_bootstrap if bootstrap is None else bootstrap

    test_cluster = self.cluster

    if enable_ssl:
        logger.debug("***using internode ssl***")
        generate_ssl_stores(self.fixture_dtest_setup.test_path)
        test_cluster.enable_internode_ssl(self.fixture_dtest_setup.test_path)

    tokens = test_cluster.balanced_tokens(2)
    test_cluster.set_configuration_options(values={'num_tokens': 1})

    logger.debug("[node1, node2] tokens: %r" % (tokens, ))

    key_count = 10000

    # Single-node cluster; the source node optionally runs another version.
    test_cluster.populate(1)
    source_node = test_cluster.nodelist()[0]
    if bootstrap_from_version:
        logger.debug("starting source node on version {}".format(bootstrap_from_version))
        source_node.set_install_dir(version=bootstrap_from_version)
    source_node.set_configuration_options(values={'initial_token': tokens[0]})
    test_cluster.start(wait_other_notice=True)

    session = self.patient_cql_connection(source_node)
    create_ks(session, 'ks', 1)
    create_cf(session, 'cf', columns={'c1': 'text', 'c2': 'text'})

    # Baseline size, taken before any of our own rows are inserted.
    empty_size = data_size(source_node, 'ks', 'cf')
    logger.debug("node1 empty size for ks.cf: %s" % float(empty_size))

    insert_statement = session.prepare(
        "INSERT INTO ks.cf (key, c1, c2) VALUES (?, 'value1', 'value2')")
    insert_params = [['k%d' % k] for k in range(key_count)]
    execute_concurrent_with_args(session, insert_statement, insert_params)

    source_node.flush()
    source_node.compact()
    initial_size = data_size(source_node, 'ks', 'cf')
    logger.debug("node1 size for ks.cf before bootstrapping node2: %s" % float(initial_size))

    # Read one random inserted row before bootstrapping; it must not
    # error. The session is closed right afterwards.
    random_key = random.randint(0, key_count - 1)
    query_c1c2(session, random_key, ConsistencyLevel.ONE)
    session.shutdown()

    # Bootstrap the second node on the current version.
    joined_node = do_bootstrap(test_cluster, tokens[1])
    joined_node.compact()

    source_node.cleanup()
    logger.debug("node1 size for ks.cf after cleanup: %s" % float(data_size(source_node, 'ks', 'cf')))
    source_node.compact()
    logger.debug("node1 size for ks.cf after compacting: %s" % float(data_size(source_node, 'ks', 'cf')))

    logger.debug("node2 size for ks.cf after compacting: %s" % float(data_size(joined_node, 'ks', 'cf')))

    source_size = float(data_size(source_node, 'ks', 'cf'))
    joined_size = float(data_size(joined_node, 'ks', 'cf'))
    assert_almost_equal(source_size, joined_size, error=0.3)
    assert_almost_equal(float(initial_size - empty_size), 2 * (source_size - float(empty_size)))

    assert_bootstrap_state(self, joined_node, 'COMPLETED')
def test_bootstrap_binary_disabled(self):
    """
    Test binary (client transport) behaviour while bootstrapping when
    streaming fails, first for a regular joining node and then for a node
    started in write survey mode.

    @jira_ticket CASSANDRA-14526, CASSANDRA-14525
    """
    # NOTE(review): a second definition named test_bootstrap_binary_disabled
    # appears later in this file; if both live in the same class the later one
    # replaces this one - confirm which should be kept.
    auth_config = {
        'authenticator': 'org.apache.cassandra.auth.PasswordAuthenticator',
        'authorizer': 'org.apache.cassandra.auth.CassandraAuthorizer',
        'role_manager': 'org.apache.cassandra.auth.CassandraRoleManager',
        'permissions_validity_in_ms': 0,
        'roles_validity_in_ms': 0,
    }
    streaming_failed = 'Some data streaming failed'
    survey_transports_off = "Not starting client transports in write_survey mode as it's bootstrapping or auth is enabled"

    cluster = self.cluster

    def break_streaming_from(*sources):
        # Submit the byteman script matching the cluster version to each
        # source node, so the next stream from it fails part-way.
        if cluster.version() < '4.0':
            for source in sources:
                source.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            for source in sources:
                source.byteman_submit([self.byteman_submit_path_4_0])

    def assert_join_refused(node):
        # `nodetool join` has to be rejected while the bootstrap is incomplete.
        try:
            node.nodetool('join')
        except ToolError as t:
            assert "Cannot join the ring until bootstrap completes" in t.stdout
        else:
            pytest.fail('nodetool should have errored and failed to join ring')

    cluster.populate(1)
    seed = cluster.nodes['node1']
    # set up byteman
    seed.byteman_port = '8100'
    seed.import_config_files()

    cluster.start(wait_other_notice=True)

    # kill stream to node2 in the middle of streaming to let it fail
    break_streaming_from(seed)
    seed.stress([
        'write', 'n=1K', 'no-warmup', 'cl=ONE',
        '-schema', 'replication(factor=3)',
        '-rate', 'threads=50',
        '-mode', 'native', 'cql3', 'user=cassandra', 'password=cassandra',
    ])
    cluster.flush()

    # start bootstrapping node2 and wait for streaming
    joiner = new_node(cluster)
    joiner.set_configuration_options(values=auth_config)
    joiner.byteman_port = '8101'  # set for when we add node3
    joiner.import_config_files()
    joiner.start(jvm_args=["-Dcassandra.ring_delay_ms=5000"], wait_other_notice=True)
    self.assert_log_had_msg(joiner, streaming_failed, timeout=30)
    self.assert_log_had_msg(
        joiner,
        'Not starting client transports as bootstrap has not completed',
        timeout=30)

    assert_join_refused(joiner)

    # Resuming the bootstrap should let node2 finish and open its binary interface.
    joiner.nodetool('bootstrap resume')
    joiner.wait_for_binary_interface()
    # NOTE(review): the credentials below look masked; the stress call above
    # uses cassandra/cassandra - confirm the intended values.
    assert_bootstrap_state(self, joiner, 'COMPLETED', user='******', password='******')

    # Test write survey behaviour
    surveyor = new_node(cluster)
    surveyor.set_configuration_options(values=auth_config)

    # kill stream to node3 in the middle of streaming to let it fail
    break_streaming_from(seed, joiner)
    surveyor.start(
        jvm_args=["-Dcassandra.write_survey=true", "-Dcassandra.ring_delay_ms=5000"],
        wait_other_notice=True)
    self.assert_log_had_msg(surveyor, streaming_failed, timeout=30)
    self.assert_log_had_msg(surveyor, survey_transports_off, timeout=30)

    assert_join_refused(surveyor)

    surveyor.nodetool('bootstrap resume')
    self.assert_log_had_msg(surveyor, survey_transports_off, timeout=30)

    # Should succeed in joining
    surveyor.nodetool('join')
    self.assert_log_had_msg(
        surveyor,
        "Leaving write survey mode and joining ring at operator request",
        timeout=30)
    assert_bootstrap_state(self, surveyor, 'COMPLETED', user='******', password='******')
    surveyor.wait_for_binary_interface(timeout=30)
def test_bootstrap_binary_disabled(self):
    """
    Test binary (client transport) behaviour while bootstrapping when
    streaming fails, for a regular joining node and for a write survey node.

    This test was ported to jvm-dtest as
    org.apache.cassandra.distributed.test.BootstrapBinaryDisabledTest. As of
    this writing a few jvm-dtest limitations require this test to stay,
    namely vnode support (ci also tests under different configs). Once
    jvm-dtest supports vnodes, this test can go away in favor of that class.

    @jira_ticket CASSANDRA-14526, CASSANDRA-14525, CASSANDRA-16127
    """
    auth_config = {
        'authenticator': 'org.apache.cassandra.auth.PasswordAuthenticator',
        'authorizer': 'org.apache.cassandra.auth.CassandraAuthorizer',
        'role_manager': 'org.apache.cassandra.auth.CassandraRoleManager',
        'permissions_validity_in_ms': 0,
        'roles_validity_in_ms': 0,
    }
    streaming_failed = 'Some data streaming failed'
    survey_transports_off = "Not starting client transports in write_survey mode as it's bootstrapping or auth is enabled"

    cluster = self.cluster

    def break_streaming_from(*sources):
        # Submit the byteman script matching the cluster version to each
        # source node, so the next stream from it fails part-way.
        if cluster.version() < '4.0':
            for source in sources:
                source.byteman_submit([self.byteman_submit_path_pre_4_0])
        else:
            for source in sources:
                source.byteman_submit([self.byteman_submit_path_4_0])

    def assert_join_refused(node):
        # `nodetool join` has to be rejected while the bootstrap is incomplete.
        try:
            node.nodetool('join')
        except ToolError as t:
            assert "Cannot join the ring until bootstrap completes" in t.stdout
        else:
            pytest.fail('nodetool should have errored and failed to join ring')

    cluster.set_environment_variable('CASSANDRA_TOKEN_PREGENERATION_DISABLED', 'True')
    cluster.populate(1)

    seed = cluster.nodes['node1']
    # set up byteman
    seed.byteman_port = '8100'
    seed.import_config_files()

    cluster.start()

    # kill stream to node2 in the middle of streaming to let it fail
    break_streaming_from(seed)
    seed.stress([
        'write', 'n=1K', 'no-warmup', 'cl=ONE',
        '-schema', 'replication(factor=3)',
        '-rate', 'threads=50',
        '-mode', 'native', 'cql3', 'user=cassandra', 'password=cassandra',
    ])
    cluster.flush()

    # start bootstrapping node2 and wait for streaming
    joiner = new_node(cluster)
    joiner.set_configuration_options(values=auth_config)
    joiner.byteman_port = '8101'  # set for when we add node3
    joiner.import_config_files()
    joiner.start(jvm_args=["-Dcassandra.ring_delay_ms=5000"])
    self.assert_log_had_msg(joiner, streaming_failed)

    assert_join_refused(joiner)

    # Resuming the bootstrap should let node2 finish and open its binary interface.
    joiner.nodetool('bootstrap resume')
    joiner.wait_for_binary_interface()
    # NOTE(review): the credentials below look masked; the stress call above
    # uses cassandra/cassandra - confirm the intended values.
    assert_bootstrap_state(self, joiner, 'COMPLETED', user='******', password='******')

    # Test write survey behaviour
    surveyor = new_node(cluster)
    surveyor.set_configuration_options(values=auth_config)

    # kill stream to node3 in the middle of streaming to let it fail
    break_streaming_from(seed, joiner)
    surveyor.start(jvm_args=["-Dcassandra.write_survey=true", "-Dcassandra.ring_delay_ms=5000"])
    self.assert_log_had_msg(surveyor, streaming_failed)
    self.assert_log_had_msg(surveyor, survey_transports_off)

    assert_join_refused(surveyor)

    surveyor.nodetool('bootstrap resume')
    self.assert_log_had_msg(surveyor, survey_transports_off)

    # Should succeed in joining
    surveyor.nodetool('join')
    self.assert_log_had_msg(surveyor, "Leaving write survey mode and joining ring at operator request")
    assert_bootstrap_state(self, surveyor, 'COMPLETED', user='******', password='******')
    surveyor.wait_for_binary_interface()