def test_gc(self):
    """
    Test that tombstone purging doesn't bring back deleted data by writing
    2 rows to a table with gc_grace=0, deleting one of those rows, then
    asserting that it isn't present in the results of SELECT *, before and
    after a flush and compaction.
    """
    cluster = self.cluster
    cluster.populate(1).start()
    [node1] = cluster.nodelist()

    time.sleep(.5)
    session = self.patient_cql_connection(node1)
    create_ks(session, 'ks', 1)
    create_cf(session, 'cf', gc_grace=0, key_type='int', columns={'c1': 'int'})

    session.execute('insert into cf (key, c1) values (1,1)')
    session.execute('insert into cf (key, c1) values (2,1)')
    node1.flush()

    assert rows_to_list(session.execute('select * from cf;')) == [[1, 1], [2, 1]]

    session.execute('delete from cf where key=1')

    assert rows_to_list(session.execute('select * from cf;')) == [[2, 1]]

    node1.flush()
    time.sleep(.5)
    node1.compact()
    time.sleep(.5)

    assert rows_to_list(session.execute('select * from cf;')) == [[2, 1]]
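# Every snippet in this section leans on the rows_to_list() helper from the dtest
# tools to turn a driver result set into plain lists that can be compared with ==.
# A minimal sketch of the behaviour these tests assume (not necessarily the
# library's exact source):
def rows_to_list(rows):
    # each driver Row behaves like a tuple of column values, in SELECT order
    return [list(row) for row in rows]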
def _token_gen_test(self, nodes, randomPart=None):
    generated_tokens, session = self.prepare(randomPart, nodes=nodes)
    dc_tokens = generated_tokens[0]

    tokens = []
    local_tokens = rows_to_list(session.execute("SELECT tokens FROM system.local"))[0]
    self.assertEqual(len(local_tokens), 1, "too many tokens for peer")
    for tok in local_tokens:
        tokens += tok

    rows = rows_to_list(session.execute("SELECT tokens FROM system.peers"))
    self.assertEqual(len(rows), nodes - 1)
    for row in rows:
        peer_tokens = row[0]
        self.assertEqual(len(peer_tokens), 1, "too many tokens for peer")
        for tok in peer_tokens:
            tokens.append(tok)

    self.assertEqual(len(tokens), len(dc_tokens))
    for cluster_token in tokens:
        tok = int(cluster_token)
        self.assertGreaterEqual(dc_tokens.index(tok), 0,
                                "token in cluster does not match generated tokens")
def query_user(self, session, userid, age, consistency, check_ret=True):
    statement = SimpleStatement("SELECT userid, age FROM users where userid = {}".format(userid),
                                consistency_level=consistency)
    res = session.execute(statement)
    expected = [[userid, age]] if age else []
    ret = rows_to_list(res) == expected
    if check_ret:
        self.assertTrue(ret, "Got {} from {}, expected {} at {}".format(rows_to_list(res),
                                                                        session.cluster.contact_points,
                                                                        expected,
                                                                        consistency_value_to_name(consistency)))
    return ret
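# query_user() above formats its failure message with consistency_value_to_name().
# A hedged sketch of what that helper presumably does, built on the driver's own
# ConsistencyLevel.value_to_name table (e.g. ConsistencyLevel.QUORUM -> 'QUORUM'):
from cassandra import ConsistencyLevel

def consistency_value_to_name(value):
    return ConsistencyLevel.value_to_name[value]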
def test_commitlog_replay_on_startup(self):
    """ Test commit log replay """
    node1 = self.node1
    node1.set_batch_commitlog(enabled=True)
    node1.start()

    debug("Insert data")
    session = self.patient_cql_connection(node1)
    create_ks(session, 'Test', 1)
    session.execute("""
        CREATE TABLE users (
            user_name varchar PRIMARY KEY,
            password varchar,
            gender varchar,
            state varchar,
            birth_year bigint
        );
    """)
    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES('gandalf', 'p@$$', 'male', 'WA', 1955);")

    debug("Verify data is present")
    session = self.patient_cql_connection(node1)
    res = session.execute("SELECT * FROM Test. users")
    self.assertItemsEqual(rows_to_list(res),
                          [[u'gandalf', 1955, u'male', u'p@$$', u'WA']])

    debug("Stop node abruptly")
    node1.stop(gently=False)

    debug("Verify commitlog was written before abrupt stop")
    commitlog_dir = os.path.join(node1.get_path(), 'commitlogs')
    commitlog_files = os.listdir(commitlog_dir)
    self.assertTrue(len(commitlog_files) > 0)

    debug("Verify no SSTables were flushed before abrupt stop")
    self.assertEqual(0, len(node1.get_sstables('test', 'users')))

    debug("Verify commit log was replayed on startup")
    node1.start()
    node1.watch_log_for("Log replay complete")
    # Here we verify from the logs that some mutations were replayed
    replays = [match_tuple[0] for match_tuple in node1.grep_log(" \d+ replayed mutations")]
    debug('The following log lines indicate that mutations were replayed: {msgs}'.format(msgs=replays))
    num_replayed_mutations = [
        parse('{} {num_mutations:d} replayed mutations{}', line).named['num_mutations']
        for line in replays
    ]
    # assert there were some lines where more than zero mutations were replayed
    self.assertNotEqual([m for m in num_replayed_mutations if m > 0], [])

    debug("Make query and ensure data is present")
    session = self.patient_cql_connection(node1)
    res = session.execute("SELECT * FROM Test. users")
    self.assertItemsEqual(rows_to_list(res),
                          [[u'gandalf', 1955, u'male', u'p@$$', u'WA']])
def test_commitlog_replay_on_startup(self):
    """ Test commit log replay """
    node1 = self.node1
    node1.set_batch_commitlog(enabled=True)
    node1.start()

    logger.debug("Insert data")
    session = self.patient_cql_connection(node1)
    create_ks(session, 'Test', 1)
    session.execute("""
        CREATE TABLE users (
            user_name varchar PRIMARY KEY,
            password varchar,
            gender varchar,
            state varchar,
            birth_year bigint
        );
    """)
    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES('gandalf', 'p@$$', 'male', 'WA', 1955);")

    logger.debug("Verify data is present")
    session = self.patient_cql_connection(node1)
    res = session.execute("SELECT * FROM Test. users")
    assert rows_to_list(res) == [['gandalf', 1955, 'male', 'p@$$', 'WA']]

    logger.debug("Stop node abruptly")
    node1.stop(gently=False)

    logger.debug("Verify commitlog was written before abrupt stop")
    commitlog_dir = os.path.join(node1.get_path(), 'commitlogs')
    commitlog_files = os.listdir(commitlog_dir)
    assert len(commitlog_files) > 0

    logger.debug("Verify no SSTables were flushed before abrupt stop")
    assert 0 == len(node1.get_sstables('test', 'users'))

    logger.debug("Verify commit log was replayed on startup")
    node1.start()
    node1.watch_log_for("Log replay complete")
    # Here we verify from the logs that some mutations were replayed
    replays = [match_tuple[0] for match_tuple in node1.grep_log(r" \d+ replayed mutations")]
    logger.debug('The following log lines indicate that mutations were replayed: {msgs}'.format(msgs=replays))
    num_replayed_mutations = [
        parse('{} {num_mutations:d} replayed mutations{}', line).named['num_mutations']
        for line in replays
    ]
    # assert there were some lines where more than zero mutations were replayed
    assert [m for m in num_replayed_mutations if m > 0] != []

    logger.debug("Make query and ensure data is present")
    session = self.patient_cql_connection(node1)
    res = session.execute("SELECT * FROM Test. users")
    assert_lists_equal_ignoring_order(rows_to_list(res), [['gandalf', 1955, 'male', 'p@$$', 'WA']])
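# The replay check above leans on the `parse` library to pull the mutation count
# out of grep'd log lines. A small, self-contained illustration of that call; the
# sample log line below is invented for demonstration, not copied from Cassandra:
from parse import parse

sample_line = "CommitLogReplayer.java:153 - 42 replayed mutations in 10 ms"
result = parse('{} {num_mutations:d} replayed mutations{}', sample_line)
assert result.named['num_mutations'] == 42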
def distribution_template(self, ratio_spec, expected_ratio, delta):
    """
    @param ratio_spec the string passed to `row-population-ratio` in the call to `cassandra-stress`
    @param expected_ratio the expected ratio of null/non-null values in the values written
    @param delta the acceptable delta between the expected and actual ratios

    A parameterized test for the `row-population-ratio` parameter to
    `cassandra-stress`.
    """
    self.cluster.populate(1).start(wait_for_binary_proto=True)
    node = self.cluster.nodelist()[0]
    node.stress(['write', 'n=1000', 'no-warmup', '-rate', 'threads=50',
                 '-col', 'n=FIXED(50)',
                 '-insert', 'row-population-ratio={ratio_spec}'.format(ratio_spec=ratio_spec)])
    session = self.patient_cql_connection(node)
    written = rows_to_list(session.execute('SELECT * FROM keyspace1.standard1;'))

    num_nones = sum(row.count(None) for row in written)
    num_results = sum(len(row) for row in written)
    self.assertAlmostEqual(float(num_nones) / num_results, expected_ratio, delta=delta)
def test_compact_counter_cluster(self):
    """
    @jira_ticket CASSANDRA-12219
    This test will fail on 3.0.0 - 3.0.8, and 3.1 - 3.8
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1 = cluster.nodelist()[0]
    session = self.patient_cql_connection(node1)
    create_ks(session, 'counter_tests', 1)

    session.execute("""
        CREATE TABLE IF NOT EXISTS counter_cs (
            key bigint PRIMARY KEY,
            data counter
        ) WITH COMPACT STORAGE
        """)

    for outer in range(0, 5):
        for idx in range(0, 5):
            session.execute("UPDATE counter_cs SET data = data + 1 WHERE key = {k}".format(k=idx))

    for idx in range(0, 5):
        row = list(session.execute("SELECT data from counter_cs where key = {k}".format(k=idx)))
        assert rows_to_list(row)[0][0] == 5
def check_data_on_each_replica(self, expect_fully_repaired, initial_replica):
    """
    Perform a SELECT * query at CL.ONE on each replica in turn. If expect_fully_repaired is True, we verify that
    each replica returns the full row being queried. If not, then we only verify that the 'a' column has been
    repaired.
    """
    stmt = SimpleStatement("SELECT * FROM alter_rf_test.t1 WHERE k=1", consistency_level=ConsistencyLevel.ONE)
    logger.debug("Checking whether read repair has completed on all replicas")
    for n in self.cluster.nodelist():
        logger.debug("Checking {n}, {x}expecting all columns"
                     .format(n=n.name, x="" if expect_fully_repaired or n == initial_replica else "not "))
        session = self.patient_exclusive_cql_connection(n)
        res = rows_to_list(session.execute(stmt))
        logger.debug("Actual result: " + str(res))
        expected = [[1, 1, 1]] if expect_fully_repaired or n == initial_replica else [[1, 1, None]]
        if res != expected:
            raise NotRepairedException()
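# check_data_on_each_replica() raises NotRepairedException rather than asserting
# so that callers can retry the check while read repair propagates. A minimal
# sketch of the assumed definition (the real class may carry a longer docstring):
class NotRepairedException(Exception):
    """Raised when a replica does not yet hold the expected repaired data."""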
def compact_counter_cluster_test(self):
    """
    @jira_ticket CASSANDRA-12219
    This test will fail on 3.0.0 - 3.0.8, and 3.1 - 3.8
    """
    cluster = self.cluster
    cluster.populate(3).start()
    node1 = cluster.nodelist()[0]
    session = self.patient_cql_connection(node1)
    create_ks(session, 'counter_tests', 1)

    session.execute("""
        CREATE TABLE IF NOT EXISTS counter_cs (
            key bigint PRIMARY KEY,
            data counter
        ) WITH COMPACT STORAGE
        """)

    for outer in range(0, 5):
        for idx in range(0, 5):
            session.execute("UPDATE counter_cs SET data = data + 1 WHERE key = {k}".format(k=idx))

    for idx in range(0, 5):
        row = list(session.execute("SELECT data from counter_cs where key = {k}".format(k=idx)))
        self.assertEqual(rows_to_list(row)[0][0], 5)
def query_counter(self, session, id, val, consistency, check_ret=True):
    statement = SimpleStatement("SELECT * from counters WHERE id = {}".format(id),
                                consistency_level=consistency)
    ret = rows_to_list(session.execute(statement))
    if check_ret:
        self.assertEqual(ret[0][1], val, "Got {} from {}, expected {} at {}".format(ret[0][1],
                                                                                    session.cluster.contact_points,
                                                                                    val,
                                                                                    consistency_value_to_name(consistency)))
    return ret[0][1] if ret else 0
def _fetch_initial_data(self, table='keyspace1.standard1', cl=ConsistencyLevel.THREE, limit=10000):
    debug("Fetching initial data from {} on {} with CL={} and LIMIT={}".format(table, self.query_node.name, cl, limit))
    session = self.patient_cql_connection(self.query_node)
    query = SimpleStatement('select * from {} LIMIT {}'.format(table, limit), consistency_level=cl)
    return rows_to_list(session.execute(query))
def _insert_rows(session, table_name, insert_stmt, values):
    prepared_insert = session.prepare(insert_stmt)
    values = list(values)  # in case values is a generator
    execute_concurrent(session, ((prepared_insert, x) for x in values),
                       concurrency=500, raise_on_first_error=True)

    data_loaded = rows_to_list(session.execute('SELECT * FROM ' + table_name))
    logger.debug('{n} rows inserted into {table_name}'.format(n=len(data_loaded), table_name=table_name))
    # compare lengths with a plain assert rather than assert_length_equal to
    # avoid printing out potentially large lists
    assert len(values) == len(data_loaded)
    return data_loaded
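# _insert_rows() pairs one prepared statement with one parameter tuple per row, the
# shape execute_concurrent() expects. A hedged sketch of how callers are assumed to
# build `values` (table and statement below are hypothetical; the CDC test further
# down passes repeat((), 10000) because its insert statement takes no bind values):
from itertools import repeat

hypothetical_insert = 'INSERT INTO ks.tab (a, b) VALUES (?, ?)'
explicit_values = [(1, 'x'), (2, 'y')]     # one bind tuple per row
generated_values = repeat((), 10000)       # rows whose values come from the statement itself
# rows = _insert_rows(session, 'ks.tab', hypothetical_insert, explicit_values)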
def _token_gen_test(self, nodes, randomPart=None):
    generated_tokens, session = self.prepare(randomPart, nodes=nodes)
    dc_tokens = generated_tokens[0]

    tokens = []
    local_tokens = rows_to_list(session.execute("SELECT tokens FROM system.local"))[0]
    assert len(local_tokens) == 1, "too many tokens for peer"
    for tok in local_tokens:
        tokens += tok

    rows = rows_to_list(session.execute("SELECT tokens FROM system.peers"))
    assert len(rows) == nodes - 1
    for row in rows:
        peer_tokens = row[0]
        assert len(peer_tokens) == 1, "too many tokens for peer"
        for tok in peer_tokens:
            tokens.append(tok)

    assert len(tokens) == len(dc_tokens)
    for cluster_token in tokens:
        tok = int(cluster_token)
        assert dc_tokens.index(tok) >= 0, "token in cluster does not match generated tokens"
def test_query_indexes_with_vnodes(self):
    """
    Verifies correct query behaviour in the presence of vnodes
    @jira_ticket CASSANDRA-11104
    """
    cluster = self.cluster
    cluster.populate(2).start()
    node1, node2 = cluster.nodelist()
    session = self.patient_cql_connection(node1)
    session.execute("CREATE KEYSPACE ks WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': '1'};")
    session.execute("CREATE TABLE ks.compact_table (a int PRIMARY KEY, b int) WITH COMPACT STORAGE;")
    session.execute("CREATE INDEX keys_index ON ks.compact_table (b);")
    session.execute("CREATE TABLE ks.regular_table (a int PRIMARY KEY, b int)")
    session.execute("CREATE INDEX composites_index on ks.regular_table (b)")

    for node in cluster.nodelist():
        start = time.time()
        while time.time() < start + 10:
            debug("waiting for index to build")
            time.sleep(1)
            if index_is_built(node, session, 'ks', 'regular_table', 'composites_index'):
                break
        else:
            raise DtestTimeoutError()

    insert_args = [(i, i % 2) for i in xrange(100)]
    execute_concurrent_with_args(session,
                                 session.prepare("INSERT INTO ks.compact_table (a, b) VALUES (?, ?)"),
                                 insert_args)
    execute_concurrent_with_args(session,
                                 session.prepare("INSERT INTO ks.regular_table (a, b) VALUES (?, ?)"),
                                 insert_args)

    res = session.execute("SELECT * FROM ks.compact_table WHERE b = 0")
    self.assertEqual(len(rows_to_list(res)), 50)
    res = session.execute("SELECT * FROM ks.regular_table WHERE b = 0")
    self.assertEqual(len(rows_to_list(res)), 50)
def drop_counter_column_test(self):
    """Test for CASSANDRA-7831"""
    cluster = self.cluster
    cluster.populate(1).start()
    node1, = cluster.nodelist()
    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'counter_tests', 1)

    session.execute("CREATE TABLE counter_bug (t int, c counter, primary key(t))")

    session.execute("UPDATE counter_bug SET c = c + 1 where t = 1")
    row = list(session.execute("SELECT * from counter_bug"))

    self.assertEqual(rows_to_list(row)[0], [1, 1])
    self.assertEqual(len(row), 1)

    session.execute("ALTER TABLE counter_bug drop c")

    assert_invalid(session, "ALTER TABLE counter_bug add c counter",
                   "Cannot re-add previously dropped counter column c")
def drop_counter_column_test(self):
    """Test for CASSANDRA-7831"""
    cluster = self.cluster
    cluster.populate(1).start()
    node1, = cluster.nodelist()
    session = self.patient_cql_connection(node1)
    create_ks(session, 'counter_tests', 1)

    session.execute("CREATE TABLE counter_bug (t int, c counter, primary key(t))")

    session.execute("UPDATE counter_bug SET c = c + 1 where t = 1")
    row = list(session.execute("SELECT * from counter_bug"))

    self.assertEqual(rows_to_list(row)[0], [1, 1])
    self.assertEqual(len(row), 1)

    session.execute("ALTER TABLE counter_bug drop c")

    assert_invalid(session, "ALTER TABLE counter_bug add c counter",
                   "Cannot re-add previously dropped counter column c")
def test_cdc_data_available_in_cdc_raw(self):
    ks_name = 'ks'
    # First, create a new node just for data generation.
    generation_node, generation_session = self.prepare(ks_name=ks_name)

    cdc_table_info = TableInfo(
        ks_name=ks_name, table_name='cdc_tab',
        column_spec=_16_uuid_column_spec,
        insert_stmt=_get_16_uuid_insert_stmt(ks_name, 'cdc_tab'),
        options={
            'cdc': 'true',
            # give table an explicit id so when we create it again it's the
            # same table and we can replay into it
            'id': uuid.uuid4()
        }
    )

    # Write until we get a new CL segment to avoid replaying initialization
    # mutations from this node's startup into system tables in the other
    # node. See CASSANDRA-11811.
    advance_to_next_cl_segment(
        session=generation_session,
        commitlog_dir=os.path.join(generation_node.get_path(), 'commitlogs')
    )

    generation_session.execute(cdc_table_info.create_stmt)

    # insert 10000 rows
    inserted_rows = _insert_rows(generation_session, cdc_table_info.name, cdc_table_info.insert_stmt,
                                 repeat((), 10000))

    # drain the node to guarantee all cl segments will be recycled
    logger.debug('draining')
    generation_node.drain()
    logger.debug('stopping')
    # stop the node and clean up all sessions attached to it
    generation_session.cluster.shutdown()
    generation_node.stop()

    # We can rely on the existing _cdc.idx files to determine which .log files contain cdc data.
    source_path = os.path.join(generation_node.get_path(), 'cdc_raw')
    source_cdc_indexes = {ReplayData.load(source_path, name)
                          for name in os.listdir(source_path)
                          if name.endswith('_cdc.idx')}
    assert source_cdc_indexes != set()

    # create a new node to use for cdc_raw cl segment replay
    loading_node = self._init_new_loading_node(ks_name, cdc_table_info.create_stmt, self.cluster.version() < '4')

    # move cdc_raw contents to commitlog directories, then start the
    # node again to trigger commitlog replay, which should replay the
    # cdc_raw files we moved to commitlogs into memtables.
    logger.debug('moving cdc_raw and restarting node')
    _move_commitlog_segments(
        os.path.join(generation_node.get_path(), 'cdc_raw'),
        os.path.join(loading_node.get_path(), 'commitlogs')
    )
    loading_node.start(wait_for_binary_proto=True)
    logger.debug('node successfully started; waiting on log replay')

    loading_node.grep_log('Log replay complete')
    logger.debug('log replay complete')

    # final assertions
    validation_session = self.patient_exclusive_cql_connection(loading_node)
    data_in_cdc_table_after_restart = rows_to_list(
        validation_session.execute('SELECT * FROM ' + cdc_table_info.name)
    )
    logger.debug('found {cdc} values in CDC table'.format(
        cdc=len(data_in_cdc_table_after_restart)
    ))
    # Then we assert that the CDC data that we expect to be there is there.
    # All data that was in CDC tables should have been copied to cdc_raw,
    # then used in commitlog replay, so it should be back in the cluster.
    assert inserted_rows == data_in_cdc_table_after_restart, 'not all expected data selected'

    if self.cluster.version() >= '4.0':
        # Create ReplayData objects for each index file found in loading cluster
        loading_path = os.path.join(loading_node.get_path(), 'cdc_raw')
        dest_cdc_indexes = [ReplayData.load(loading_path, name)
                            for name in os.listdir(loading_path)
                            if name.endswith('_cdc.idx')]

        # Compare source replay data to dest to ensure replay process created both hard links and index files.
        for srd in source_cdc_indexes:
            # Confirm both log and index are in dest
            assert os.path.isfile(os.path.join(loading_path, srd.idx_name))
            assert os.path.isfile(os.path.join(loading_path, srd.log_name))

            # Find dest ReplayData that corresponds to the source (should be exactly 1)
            corresponding_dest_replay_datae = [x for x in dest_cdc_indexes
                                               if srd.idx_name == x.idx_name]
            assert_length_equal(corresponding_dest_replay_datae, 1)
            drd = corresponding_dest_replay_datae[0]

            # We can't compare equality on offsets since replay uses the raw file length as the written
            # cdc offset. We *can*, however, confirm that the offset in the replayed file is >=
            # the source file, ensuring clients are signaled to replay at least all the data in the
            # log.
            assert drd.offset >= srd.offset

            # Confirm completed flag is the same in both
            assert srd.completed == drd.completed

        # Confirm that the relationship between index files on the source
        # and destination looks like we expect.
        # First, grab the mapping between the two, make sure it's a 1-1
        # mapping, and transform the dict to reflect that:
        src_to_dest_idx_map = {
            src_rd: [dest_rd for dest_rd in dest_cdc_indexes
                     if dest_rd.idx_name == src_rd.idx_name]
            for src_rd in source_cdc_indexes
        }
        for src_rd, dest_rds in src_to_dest_idx_map.items():
            assert_length_equal(dest_rds, 1)
            src_to_dest_idx_map[src_rd] = dest_rds[0]

        # All offsets in idx files that were copied should be >0 on the
        # destination node.
        assert 0 not in {i.offset for i in src_to_dest_idx_map.values()}, \
            ('Found index offsets == 0 in an index file on the '
             'destination node that corresponds to an index file on the '
             'source node:\n'
             '{}').format(pformat(src_to_dest_idx_map))

        # Offsets of all shared indexes should be >= on the destination
        # than on the source.
        for src_rd, dest_rd in src_to_dest_idx_map.items():
            assert dest_rd.offset >= src_rd.offset

        src_to_dest_idx_map = {
            src_rd: [dest_rd for dest_rd in dest_cdc_indexes
                     if dest_rd.idx_name == src_rd.idx_name]
            for src_rd in source_cdc_indexes
        }
        for k, v in src_to_dest_idx_map.items():
            assert_length_equal(v, 1)
            assert v[0].offset >= k.offset
def read_as_list(self, query, session=None, node=None):
    session = session or self.exclusive_cql_connection(node or self.node1)
    return rows_to_list(self.quorum(session, query))
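# read_as_list() above relies on a quorum() helper on the test class. A hedged
# sketch of what that helper is assumed to do -- run the statement at CL QUORUM:
from cassandra import ConsistencyLevel
from cassandra.query import SimpleStatement

def quorum(self, session, stmt_str):
    return session.execute(SimpleStatement(stmt_str, consistency_level=ConsistencyLevel.QUORUM))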
def test_json_tools(self):
    logger.debug("Starting cluster...")
    cluster = self.cluster
    cluster.set_batch_commitlog(enabled=True)
    cluster.populate(1).start()

    logger.debug("Version: " + cluster.version().vstring)

    logger.debug("Getting CQLSH...")
    [node1] = cluster.nodelist()
    session = self.patient_cql_connection(node1)

    logger.debug("Inserting data...")
    create_ks(session, 'Test', 1)

    session.execute("""
        CREATE TABLE users (
            user_name varchar PRIMARY KEY,
            password varchar,
            gender varchar,
            state varchar,
            birth_year bigint
        );
    """)

    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES ('frodo', 'pass@', 'male', 'CA', 1985);")
    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES ('sam', '@pass', 'male', 'NY', 1980);")

    res = session.execute("SELECT * FROM Test. users")

    assert_lists_equal_ignoring_order(rows_to_list(res),
                                      [['frodo', 1985, 'male', 'pass@', 'CA'],
                                       ['sam', 1980, 'male', '@pass', 'NY']])

    logger.debug("Flushing and stopping cluster...")
    node1.flush()
    cluster.stop()

    logger.debug("Exporting to JSON file...")
    json_path = tempfile.mktemp(suffix='.schema.json')
    with open(json_path, 'w') as f:
        node1.run_sstable2json(f)

    with open(json_path, 'r') as fin:
        data = fin.read().splitlines(True)
    if data[0][0] == 'W':
        with open(json_path, 'w') as fout:
            fout.writelines(data[1:])

    logger.debug("Deleting cluster and creating new...")
    cluster.clear()
    cluster.start()

    logger.debug("Inserting data...")
    session = self.patient_cql_connection(node1)
    create_ks(session, 'Test', 1)

    session.execute("""
        CREATE TABLE users (
            user_name varchar PRIMARY KEY,
            password varchar,
            gender varchar,
            state varchar,
            birth_year bigint
        );
    """)

    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES ('gandalf', 'p@$$', 'male', 'WA', 1955);")
    node1.flush()
    cluster.stop()

    logger.debug("Importing JSON file...")
    with open(json_path) as f:
        node1.run_json2sstable(f, "test", "users")
    os.remove(json_path)

    logger.debug("Verifying import...")
    cluster.start()
    [node1] = cluster.nodelist()
    session = self.patient_cql_connection(node1)

    res = session.execute("SELECT * FROM Test. users")

    logger.debug("data: " + str(res))

    assert_lists_equal_ignoring_order(rows_to_list(res),
                                      [['frodo', 1985, 'male', 'pass@', 'CA'],
                                       ['sam', 1980, 'male', '@pass', 'NY'],
                                       ['gandalf', 1955, 'male', 'p@$$', 'WA']])
def alter_rf_and_run_read_repair_test(self):
    """
    @jira_ticket CASSANDRA-10655
    @jira_ticket CASSANDRA-10657

    Test that querying only a subset of all the columns in a row doesn't confuse read-repair to avoid
    the problem described in CASSANDRA-10655.
    """
    session = self.patient_cql_connection(self.cluster.nodelist()[0])
    session.execute("""CREATE KEYSPACE alter_rf_test
                       WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};""")
    session.execute("CREATE TABLE alter_rf_test.t1 (k int PRIMARY KEY, a int, b int);")
    session.execute("INSERT INTO alter_rf_test.t1 (k, a, b) VALUES (1, 1, 1);")
    cl_one_stmt = SimpleStatement("SELECT * FROM alter_rf_test.t1 WHERE k=1",
                                  consistency_level=ConsistencyLevel.ONE)

    # identify the initial replica and trigger a flush to ensure reads come from sstables
    initial_replica, non_replicas = self.identify_initial_placement('alter_rf_test', 't1', 1)
    debug("At RF=1 replica for data is " + initial_replica.name)
    initial_replica.flush()

    # At RF=1, it shouldn't matter which node we query, as the actual data should always come from the
    # initial replica when reading at CL ONE
    for n in self.cluster.nodelist():
        debug("Checking " + n.name)
        session = self.patient_exclusive_cql_connection(n)
        assert_one(session, "SELECT * FROM alter_rf_test.t1 WHERE k=1", [1, 1, 1], cl=ConsistencyLevel.ONE)

    # Alter so RF=n but don't repair, then execute a query which selects only a subset of the columns. Run this at
    # CL ALL on one of the nodes which doesn't currently have the data, triggering a read repair.
    # The expectation is that every replica will have been repaired for that column (but we make no assumptions
    # about the other columns).
    debug("Changing RF from 1 to 3")
    session.execute("""ALTER KEYSPACE alter_rf_test
                       WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};""")
    cl_all_stmt = SimpleStatement("SELECT a FROM alter_rf_test.t1 WHERE k=1",
                                  consistency_level=ConsistencyLevel.ALL)
    debug("Executing SELECT on non-initial replica to trigger read repair " + non_replicas[0].name)
    read_repair_session = self.patient_exclusive_cql_connection(non_replicas[0])
    # result of the CL ALL query contains only the selected column
    assert_one(read_repair_session, "SELECT a FROM alter_rf_test.t1 WHERE k=1", [1], cl=ConsistencyLevel.ALL)

    # Check the results of the read repair by querying each replica again at CL ONE
    debug("Re-running SELECTs at CL ONE to verify read repair")
    for n in self.cluster.nodelist():
        debug("Checking " + n.name)
        session = self.patient_exclusive_cql_connection(n)
        res = rows_to_list(session.execute(cl_one_stmt))
        # Column a must be 1 everywhere, and column b must be either 1 or None everywhere
        self.assertIn(res[0][:2], [[1, 1], [1, None]])

    # Now query at ALL but selecting all columns
    query = "SELECT * FROM alter_rf_test.t1 WHERE k=1"
    debug("Executing SELECT on non-initial replica to trigger read repair " + non_replicas[0].name)
    read_repair_session = self.patient_exclusive_cql_connection(non_replicas[0])
    assert_one(session, query, [1, 1, 1], cl=ConsistencyLevel.ALL)

    # Check that every replica is now fully up to date
    debug("Re-running SELECTs at CL ONE to verify read repair")
    for n in self.cluster.nodelist():
        debug("Checking " + n.name)
        session = self.patient_exclusive_cql_connection(n)
        assert_one(session, query, [1, 1, 1], cl=ConsistencyLevel.ONE)
def _fetch_initial_data(self, table='keyspace1.standard1', cl=ConsistencyLevel.THREE, limit=10000):
    logger.debug("Fetching initial data from {} on {} with CL={} and LIMIT={}".format(table, self.query_node.name, cl, limit))
    session = self.patient_cql_connection(self.query_node)
    query = SimpleStatement('select * from {} LIMIT {}'.format(table, limit), consistency_level=cl)
    return rows_to_list(session.execute(query, timeout=20))
def test_cdc_data_available_in_cdc_raw(self):
    ks_name = 'ks'
    # First, create a new node just for data generation.
    generation_node, generation_session = self.prepare(ks_name=ks_name)

    cdc_table_info = TableInfo(
        ks_name=ks_name, table_name='cdc_tab',
        column_spec=_16_uuid_column_spec,
        insert_stmt=_get_16_uuid_insert_stmt(ks_name, 'cdc_tab'),
        options={
            'cdc': 'true',
            # give table an explicit id so when we create it again it's the
            # same table and we can replay into it
            'id': uuid.uuid4()
        })

    # Write until we get a new CL segment to avoid replaying initialization
    # mutations from this node's startup into system tables in the other
    # node. See CASSANDRA-11811.
    advance_to_next_cl_segment(session=generation_session,
                               commitlog_dir=os.path.join(generation_node.get_path(), 'commitlogs'))

    generation_session.execute(cdc_table_info.create_stmt)

    # insert 10000 rows
    inserted_rows = _insert_rows(generation_session, cdc_table_info.name, cdc_table_info.insert_stmt,
                                 repeat((), 10000))

    # drain the node to guarantee all cl segments will be recycled
    debug('draining')
    generation_node.drain()
    debug('stopping')
    # stop the node and clean up all sessions attached to it
    generation_node.stop()
    generation_session.cluster.shutdown()

    # create a new node to use for cdc_raw cl segment replay
    loading_node = self._init_new_loading_node(ks_name, cdc_table_info.create_stmt, self.cluster.version() < '4')

    # move cdc_raw contents to commitlog directories, then start the
    # node again to trigger commitlog replay, which should replay the
    # cdc_raw files we moved to commitlogs into memtables.
    debug('moving cdc_raw and restarting node')
    _move_contents(
        os.path.join(generation_node.get_path(), 'cdc_raw'),
        os.path.join(loading_node.get_path(), 'commitlogs')
    )
    loading_node.start(wait_for_binary_proto=True)
    debug('node successfully started; waiting on log replay')

    loading_node.grep_log('Log replay complete')
    debug('log replay complete')

    # final assertions
    validation_session = self.patient_exclusive_cql_connection(loading_node)
    data_in_cdc_table_after_restart = rows_to_list(
        validation_session.execute('SELECT * FROM ' + cdc_table_info.name)
    )
    debug('found {cdc} values in CDC table'.format(
        cdc=len(data_in_cdc_table_after_restart)
    ))
    # Then we assert that the CDC data that we expect to be there is there.
    # All data that was in CDC tables should have been copied to cdc_raw,
    # then used in commitlog replay, so it should be back in the cluster.
    self.assertEqual(
        inserted_rows,
        data_in_cdc_table_after_restart,
        # The message on failure is too long, since cdc_data is thousands
        # of items, so we print something else here
        msg='not all expected data selected'
    )
def json_tools_test(self):
    debug("Starting cluster...")
    cluster = self.cluster
    cluster.set_batch_commitlog(enabled=True)
    cluster.populate(1).start()

    debug("Version: " + cluster.version().vstring)

    debug("Getting CQLSH...")
    [node1] = cluster.nodelist()
    session = self.patient_cql_connection(node1)

    debug("Inserting data...")
    create_ks(session, 'Test', 1)

    session.execute("""
        CREATE TABLE users (
            user_name varchar PRIMARY KEY,
            password varchar,
            gender varchar,
            state varchar,
            birth_year bigint
        );
    """)

    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES('frodo', 'pass@', 'male', 'CA', 1985);")
    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES('sam', '@pass', 'male', 'NY', 1980);")

    res = session.execute("SELECT * FROM Test. users")

    self.assertItemsEqual(rows_to_list(res),
                          [[u'frodo', 1985, u'male', u'pass@', u'CA'],
                           [u'sam', 1980, u'male', u'@pass', u'NY']])

    debug("Flushing and stopping cluster...")
    node1.flush()
    cluster.stop()

    debug("Exporting to JSON file...")
    json_path = tempfile.mktemp(suffix='.schema.json')
    with open(json_path, 'w') as f:
        node1.run_sstable2json(f)

    with open(json_path, 'r') as fin:
        data = fin.read().splitlines(True)
    if data[0][0] == 'W':
        with open(json_path, 'w') as fout:
            fout.writelines(data[1:])

    debug("Deleting cluster and creating new...")
    cluster.clear()
    cluster.start()

    debug("Inserting data...")
    session = self.patient_cql_connection(node1)
    create_ks(session, 'Test', 1)

    session.execute("""
        CREATE TABLE users (
            user_name varchar PRIMARY KEY,
            password varchar,
            gender varchar,
            state varchar,
            birth_year bigint
        );
    """)

    session.execute("INSERT INTO Test. users (user_name, password, gender, state, birth_year) "
                    "VALUES('gandalf', 'p@$$', 'male', 'WA', 1955);")
    node1.flush()
    cluster.stop()

    debug("Importing JSON file...")
    with open(json_path) as f:
        node1.run_json2sstable(f, "test", "users")
    os.remove(json_path)

    debug("Verifying import...")
    cluster.start()
    [node1] = cluster.nodelist()
    session = self.patient_cql_connection(node1)

    res = session.execute("SELECT * FROM Test. users")

    debug("data: " + str(res))

    self.assertItemsEqual(rows_to_list(res),
                          [[u'frodo', 1985, u'male', u'pass@', u'CA'],
                           [u'sam', 1980, u'male', u'@pass', u'NY'],
                           [u'gandalf', 1955, u'male', u'p@$$', u'WA']])