class StorageService(object):
    """Thin JMX wrapper around a node's StorageService MBean."""

    def __init__(self, node):
        assert isinstance(node, Node)
        self.node = node
        self.jmx = JolokiaAgent(node)
        self.mbean = make_mbean("db", type="StorageService")

    def start(self):
        """Start the underlying Jolokia agent."""
        self.jmx.start()

    def stop(self):
        """Stop the underlying Jolokia agent."""
        self.jmx.stop()

    def get_replicas(self, ks, cf, key):
        """Return the natural endpoints (with ports) owning `key` in `ks.cf`."""
        signature = ("getNaturalEndpointsWithPort"
                     "(java.lang.String,java.lang.String,java.lang.String,boolean)")
        return self.jmx.execute_method(self.mbean, signature, [ks, cf, key, True])

    def __enter__(self):
        """ For contextmanager-style usage. """
        self.start()
        return self

    def __exit__(self, exc_type, value, traceback):
        """ For contextmanager-style usage. """
        self.stop()
class StorageService(object):
    """Context-manageable JMX facade for a node's StorageService MBean."""

    # JMX operation signature for the replica lookup below.
    _GET_REPLICAS_SIG = ("getNaturalEndpointsWithPort"
                         "(java.lang.String,java.lang.String,java.lang.String,boolean)")

    def __init__(self, node):
        assert isinstance(node, Node)
        self.node = node
        self.jmx = JolokiaAgent(node)
        self.mbean = make_mbean("db", type="StorageService")

    def start(self):
        """Spin up the Jolokia agent backing this wrapper."""
        self.jmx.start()

    def stop(self):
        """Tear down the Jolokia agent backing this wrapper."""
        self.jmx.stop()

    def get_replicas(self, ks, cf, key):
        """Look up the replica endpoints (with ports) for `key` in `ks.cf`."""
        return self.jmx.execute_method(self.mbean, self._GET_REPLICAS_SIG,
                                       [ks, cf, key, True])

    def __enter__(self):
        """ For contextmanager-style usage. """
        self.start()
        return self

    def __exit__(self, exc_type, value, traceback):
        """ For contextmanager-style usage. """
        self.stop()
def test_reloadlocalschema(self):
    """
    @jira_ticket CASSANDRA-13954

    Test that `nodetool reloadlocalschema` works as intended
    """
    cluster = self.cluster
    cluster.populate(1)
    node = cluster.nodelist()[0]
    remove_perf_disable_shared_mem(node)  # for jmx
    cluster.start()

    session = self.patient_cql_connection(node)

    query = "CREATE KEYSPACE IF NOT EXISTS test WITH replication = {'class': 'NetworkTopologyStrategy', 'datacenter1': 2};"
    session.execute(query)

    query = 'CREATE TABLE test.test (pk int, ck int, PRIMARY KEY (pk, ck));'
    session.execute(query)

    ss = make_mbean('db', type='StorageService')

    schema_version = ''

    # get initial schema version
    with JolokiaAgent(node) as jmx:
        schema_version = jmx.read_attribute(ss, 'SchemaVersion')

    # manually add a regular column 'val' to test.test by writing directly to the
    # schema tables, bypassing the normal schema-change path
    query = """
        INSERT INTO system_schema.columns
            (keyspace_name, table_name, column_name, clustering_order, column_name_bytes, kind, position, type)
        VALUES
            ('test', 'test', 'val', 'none', 0x76616c, 'regular', -1, 'int');"""
    session.execute(query)

    # validate that schema version wasn't automatically updated
    with JolokiaAgent(node) as jmx:
        self.assertEqual(schema_version, jmx.read_attribute(ss, 'SchemaVersion'))

    # make sure the new column wasn't automagically picked up
    assert_invalid(session, 'INSERT INTO test.test (pk, ck, val) VALUES (0, 1, 2);')

    # force the node to reload schema from disk
    node.nodetool('reloadlocalschema')

    # validate that schema version changed
    with JolokiaAgent(node) as jmx:
        self.assertNotEqual(schema_version, jmx.read_attribute(ss, 'SchemaVersion'))

    # try an insert with the new column again and validate it succeeds this time
    session.execute('INSERT INTO test.test (pk, ck, val) VALUES (0, 1, 2);')
    assert_all(session, 'SELECT pk, ck, val FROM test.test;', [[0, 1, 2]])
def test_repaired_tracking_with_partition_deletes(self):
    """
    Check that when tracking repaired data status following a digest mismatch,
    repaired data mismatches are marked as unconfirmed as we may skip sstables
    after the partition deletes are encountered.
    @jira_ticket CASSANDRA-14145
    """
    session, node1, node2 = self.setup_for_repaired_data_tracking()
    stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, %s, %s)")
    stmt.consistency_level = ConsistencyLevel.ALL

    for i in range(10):
        session.execute(stmt, (i, i, i))

    for node in self.cluster.nodelist():
        node.flush()
        self.assertNoRepairedSSTables(node, 'ks')

    # mark everything repaired, then diverge the replicas: delete a partition
    # on node1 only while node2 is down
    node1.repair(options=['ks'])
    node2.stop(wait_other_notice=True)

    session.execute("delete from ks.tbl where k = 5")

    node1.flush()
    node2.start()

    # expect unconfirmed inconsistencies as the partition deletes cause some sstables to be skipped
    with JolokiaAgent(node1) as jmx:
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5",
                                                 expect_unconfirmed_inconsistencies=True)
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5",
                                                 expect_unconfirmed_inconsistencies=True)
        # no digest reads for range queries so blocking read repair metric isn't incremented
        # *all* sstables are read for partition ranges too, and as the repaired set is still in sync there should
        # be no inconsistencies
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl",
                                                 expect_read_repair=False)
def test_blacklisted_directory(self):
    """
    Mark one of a node's data directories unwritable via the
    BlacklistedDirectories MBean and verify reads keep succeeding, both
    immediately and after `nodetool relocatesstables`.
    """
    cluster = self.cluster
    cluster.set_datadir_count(3)
    cluster.populate(1)
    [node] = cluster.nodelist()
    cluster.start()

    session = self.patient_cql_connection(node)
    create_ks(session, 'ks', 1)
    create_c1c2_table(self, session)
    insert_c1c2(session, n=10000)
    node.flush()
    # sanity check: every key readable before blacklisting
    for k in range(0, 10000):
        query_c1c2(session, k)

    node.compact()
    mbean = make_mbean('db', type='BlacklistedDirectories')
    with JolokiaAgent(node) as jmx:
        # blacklist the first data directory for writes
        jmx.execute_method(mbean, 'markUnwritable', [os.path.join(node.get_path(), 'data1')])

    # reads must still work with data1 unwritable
    for k in range(0, 10000):
        query_c1c2(session, k)

    node.nodetool('relocatesstables')

    # and still work after sstables have been moved off the blacklisted dir
    for k in range(0, 10000):
        query_c1c2(session, k)
def test_oversized_mutation(self):
    """
    Test that multi-DC write failures return operation failed rather than a timeout.
    @jira_ticket CASSANDRA-16334.
    """
    cluster = self.cluster
    cluster.populate([2, 2])
    cluster.set_configuration_options(values={'max_mutation_size_in_kb': 128})
    cluster.start()
    node1 = cluster.nodelist()[0]
    session = self.patient_exclusive_cql_connection(node1)

    session.execute("CREATE KEYSPACE k WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 2}")
    session.execute("CREATE TABLE k.t (key int PRIMARY KEY, val blob)")

    # 256 KB payload deliberately exceeds the 128 KB max_mutation_size_in_kb cap
    payload = '1' * 1024 * 256
    query = "INSERT INTO k.t (key, val) VALUES (1, textAsBlob('{}'))".format(payload)

    assert_write_failure(session, query, ConsistencyLevel.LOCAL_ONE)
    assert_write_failure(session, query, ConsistencyLevel.ONE)

    # verify that no hints are created
    with JolokiaAgent(node1) as jmx:
        assert 0 == jmx.read_attribute(make_mbean('metrics', type='Storage', name='TotalHints'), 'Count')
def overlapping_data_folders(self):
    """
    @jira_ticket CASSANDRA-10902

    Place the saved-caches directory inside the data directory, save the
    caches via JMX, and verify the node restarts cleanly afterwards.
    """
    self.cluster.populate(1)
    node1 = self.cluster.nodelist()[0]
    default_path = node1.data_directories()[0]
    # nest saved_caches under the data directory to provoke the overlap
    node1.set_configuration_options({'saved_caches_directory': os.path.join(default_path, 'saved_caches')})
    remove_perf_disable_shared_mem(node1)
    self.cluster.start(wait_for_binary_proto=True)

    session = self.patient_exclusive_cql_connection(node1)
    session.execute("CREATE KEYSPACE ks WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}")
    session.execute("CREATE TABLE ks.tab (key int PRIMARY KEY, a int)")
    session.execute("INSERT INTO ks.tab (key, a) VALUES (%s, %s)", [0, 0])
    # a read populates the key cache so there is something to save
    session.execute("SELECT * FROM ks.tab WHERE key = %s", [0])

    cache_service = make_mbean('db', type="Caches")
    with JolokiaAgent(node1) as jmx:
        jmx.execute_method(cache_service, 'saveCaches')

    # restart must succeed despite the nested saved_caches directory
    self.cluster.stop()
    self.cluster.start(wait_for_binary_proto=True)
def test_tracing_does_not_interfere_with_digest_calculation(self):
    """
    Test that enabling tracing doesn't interfere with digest responses when using
    RandomPartitioner. The use of a threadlocal MessageDigest for generating both
    DigestResponse messages and for calculating tokens meant that the DigestResponse
    was always incorrect when both RP and tracing were enabled, leading to unnecessary
    data reads.

    @jira_ticket CASSANDRA-13964
    """
    session = self.prepare(random_partitioner=True)
    self.trace(session)

    node1 = self.cluster.nodelist()[0]

    rr_count = make_mbean('metrics', type='ReadRepair', name='RepairedBlocking')
    with JolokiaAgent(node1) as jmx:
        # the MBean may not have been initialized, in which case Jolokia agent will return
        # a HTTP 404 response. If we receive such, we know that no digest mismatch was reported
        # If we are able to read the MBean attribute, assert that the count is 0
        if jmx.has_mbean(rr_count):
            # expect 0 digest mismatches
            assert 0 == jmx.read_attribute(rr_count, 'Count')
        else:
            pass
def _batchlog_replay_compatibility_test(self, coordinator_idx, current_nodes, previous_version, previous_nodes, protocol_version):
    """
    Write a batch through a coordinator rigged (via byteman) to fail after the
    batchlog write, force batchlog replay on the other nodes, and verify both
    that at least 2 batches were replayed and that every node sees the rows.
    """
    session = self.prepare_mixed(coordinator_idx, current_nodes, previous_version, previous_nodes,
                                 protocol_version=protocol_version, install_byteman=True)
    coordinator = self.cluster.nodelist()[coordinator_idx]

    coordinator.byteman_submit([mk_bman_path('fail_after_batchlog_write.btm')])
    logger.debug("Injected byteman scripts to enable batchlog replay {}".format(coordinator.name))

    query = """
        BEGIN BATCH
        INSERT INTO users (id, firstname, lastname) VALUES (0, 'Jack', 'Sparrow')
        INSERT INTO users (id, firstname, lastname) VALUES (1, 'Will', 'Turner')
        APPLY BATCH
    """
    session.execute(query)

    # batchlog replay skips over all entries that are younger than
    # 2 * write_request_timeout_in_ms ms: 1x timeout for all mutations to be written,
    # and another 1x timeout for batch remove mutation to be received.
    delay = 2 * coordinator.get_conf_option('write_request_timeout_in_ms') / 1000.0 + 1
    logger.debug('Sleeping for {}s for the batches to not be skipped'.format(delay))
    time.sleep(delay)

    total_batches_replayed = 0
    blm = make_mbean('db', type='BatchlogManager')

    for n in self.cluster.nodelist():
        if n == coordinator:
            continue

        with JolokiaAgent(n) as jmx:
            logger.debug('Forcing batchlog replay for {}'.format(n.name))
            jmx.execute_method(blm, 'forceBatchlogReplay')
            batches_replayed = jmx.read_attribute(blm, 'TotalBatchesReplayed')
            logger.debug('{} batches replayed on node {}'.format(batches_replayed, n.name))
            total_batches_replayed += batches_replayed

    assert total_batches_replayed >= 2

    for node in self.cluster.nodelist():
        session = self.patient_exclusive_cql_connection(node, protocol_version=protocol_version)
        rows = sorted(session.execute('SELECT id, firstname, lastname FROM ks.users'))
        # BUGFIX: previously `assert [[...], [...]], [list(rows[0]) == list(rows[1])]`
        # which always passed — a non-empty list literal is truthy and the comparison
        # sat in the assert's message slot. Compare expected vs actual rows instead.
        assert [[0, 'Jack', 'Sparrow'], [1, 'Will', 'Turner']] == [list(rows[0]), list(rows[1])]
def test_tracing_does_not_interfere_with_digest_calculation(self):
    """
    Test that enabling tracing doesn't interfere with digest responses when using
    RandomPartitioner. The use of a threadlocal MessageDigest for generating both
    DigestResponse messages and for calculating tokens meant that the DigestResponse
    was always incorrect when both RP and tracing were enabled, leading to unnecessary
    data reads.

    @jira_ticket CASSANDRA-13964
    """
    cluster = self.cluster
    cluster.populate(3)
    # generous timeouts so slow tracing never masquerades as a read failure
    cluster.set_configuration_options(values={'write_request_timeout_in_ms': 30000,
                                              'read_request_timeout_in_ms': 30000})
    cluster.set_partitioner("org.apache.cassandra.dht.RandomPartitioner")
    cluster.start(jvm_args=['-Dcassandra.wait_for_tracing_events_timeout_secs=15'])

    node1 = cluster.nodelist()[0]
    session = self.patient_cql_connection(node1)
    create_ks(session, 'ks', 3)
    session.execute("""
        CREATE TABLE ks.users (
            userid uuid PRIMARY KEY,
            firstname text,
            lastname text,
            age int
        );
    """)

    insert = session.prepare("INSERT INTO ks.users (userid, firstname, lastname, age) "
                             "VALUES (?, 'Frodo', 'Baggins', 32)")
    insert.consistency_level = ConsistencyLevel.ALL

    select = session.prepare("SELECT firstname, lastname "
                             "FROM ks.users WHERE userid = ?")
    select.consistency_level = ConsistencyLevel.ALL

    # every traced read at CL.ALL would trigger a digest mismatch before the fix
    for _ in range(10):
        id = uuid4()
        session.execute(insert.bind((id, )), timeout=30)
        res = session.execute(select.bind((id, )), timeout=30, trace=True)
        assert 1 == len(res.response_future.get_query_trace_ids())

    rr_count = make_mbean('metrics', type='ReadRepair', name='RepairedBlocking')
    with JolokiaAgent(node1) as jmx:
        # the MBean may not have been initialized, in which case Jolokia agent will return
        # a HTTP 404 response. If we receive such, we know that no digest mismatch was reported
        # If we are able to read the MBean attribute, assert that the count is 0
        if jmx.has_mbean(rr_count):
            # expect 0 digest mismatches
            assert 0 == jmx.read_attribute(rr_count, 'Count')
        else:
            pass
def setup_once(self):
    """
    One-time fixture: start a 2-node cluster with tight request timeouts and a
    tombstone failure threshold, create the keyspaces/tables/MV used by the
    tests, and seed one partition with enough tombstones to trip the threshold.
    Publishes a started JolokiaAgent for node1 via the module-level `jmx` global.
    """
    cluster = self.cluster
    cluster.set_configuration_options({'read_request_timeout_in_ms': 3000,
                                       'write_request_timeout_in_ms': 3000,
                                       'phi_convict_threshold': 12,
                                       'tombstone_failure_threshold': TOMBSTONE_FAILURE_THRESHOLD,
                                       'enable_materialized_views': 'true'})
    cluster.populate(2, debug=True)
    cluster.start(jvm_args=JVM_ARGS)
    node1 = cluster.nodelist()[0]

    # NOTE(review): shared module-level agent, presumably torn down elsewhere — confirm
    global jmx
    jmx = JolokiaAgent(node1)
    jmx.start()

    s = self.session = self.patient_exclusive_cql_connection(node1, retry_policy=FallthroughRetryPolicy(), request_timeout=30)
    for k in [KEYSPACE, FAIL_WRITE_KEYSPACE]:
        create_ks(s, k, 2)
        s.execute(f"CREATE TABLE {k}.{TABLE} (k int, c int, v int, PRIMARY KEY (k,c))")
    create_ks(s, VIEW_KEYSPACE, 1)
    s.execute(f"CREATE TABLE {VIEW_KEYSPACE}.{TABLE} (k int, c int, v int, PRIMARY KEY (k,c))")
    s.execute(f"CREATE MATERIALIZED VIEW {VIEW_KEYSPACE}.{VIEW} AS SELECT * FROM {TABLE} WHERE c IS NOT NULL AND k IS NOT NULL PRIMARY KEY (c,k);")

    # Here we're doing a series of deletions in order to create enough tombstones to exceed the configured fail threshold.
    # This partition will be used to test read failures.
    for c in range(TOMBSTONE_FAILURE_THRESHOLD + 1):
        self.session.execute(f"DELETE FROM {KEYSPACE}.{TABLE} WHERE k={TOMBSTONE_FAIL_KEY} AND c={c}")

    node1.watch_log_for("Created default superuser role 'cassandra'")  # don't race with async default role creation, which creates a write
    node1.watch_log_for('Completed submission of build tasks for any materialized views defined at startup',
                        filename='debug.log')  # view builds cause background reads
class StorageProxy(object):
    """JMX reader for read-repair attributes on a node's StorageProxy MBean."""

    def __init__(self, node):
        assert isinstance(node, Node)
        self.node = node
        self.jmx = JolokiaAgent(node)
        self.mbean = make_mbean("db", type="StorageProxy")

    def start(self):
        """Start the backing Jolokia agent."""
        self.jmx.start()

    def stop(self):
        """Stop the backing Jolokia agent."""
        self.jmx.stop()

    def _attr(self, name):
        # All exposed values are plain attributes on the StorageProxy MBean.
        return self.jmx.read_attribute(self.mbean, name)

    @property
    def blocking_read_repair(self):
        return self._attr("ReadRepairRepairedBlocking")

    @property
    def speculated_data_request(self):
        return self._attr("ReadRepairSpeculatedRequest")

    @property
    def speculated_data_repair(self):
        return self._attr("ReadRepairSpeculatedRepair")

    def __enter__(self):
        """ For contextmanager-style usage. """
        self.start()
        return self

    def __exit__(self, exc_type, value, traceback):
        """ For contextmanager-style usage. """
        self.stop()
class TableMetrics(object):
    """JMX reader for per-table write/speculative-read/transient-write counters."""

    def __init__(self, node, keyspace, table):
        assert isinstance(node, Node)
        self.jmx = JolokiaAgent(node)

        def table_mbean(metric_name):
            # All three metrics live under the same per-table scope.
            return make_mbean("metrics", type="Table", name=metric_name,
                              keyspace=keyspace, scope=table)

        self.write_latency_mbean = table_mbean("WriteLatency")
        self.speculative_reads_mbean = table_mbean("SpeculativeRetries")
        self.transient_writes_mbean = table_mbean("TransientWrites")

    def _count(self, mbean):
        return self.jmx.read_attribute(mbean, "Count")

    @property
    def write_count(self):
        return self._count(self.write_latency_mbean)

    @property
    def speculative_reads(self):
        return self._count(self.speculative_reads_mbean)

    @property
    def transient_writes(self):
        return self._count(self.transient_writes_mbean)

    def start(self):
        """Start the backing Jolokia agent."""
        self.jmx.start()

    def stop(self):
        """Stop the backing Jolokia agent."""
        self.jmx.stop()

    def __enter__(self):
        """ For contextmanager-style usage. """
        self.start()
        return self

    def __exit__(self, exc_type, value, traceback):
        """ For contextmanager-style usage. """
        self.stop()
def table_metric(node, keyspace, table, name):
    """Read the 'Value' attribute of the named per-table metric on `node`."""
    # Before 3.0 the per-table metric type was still called "ColumnFamily".
    if node.get_cassandra_version() < '3.0':
        metric_type = "ColumnFamily"
    else:
        metric_type = 'Table'
    mbean = make_mbean('metrics', type=metric_type, name=name,
                       keyspace=keyspace, scope=table)
    with JolokiaAgent(node) as jmx:
        return jmx.read_attribute(mbean, 'Value')
def assert_parent_repair_session_count(nodes, expected):
    """Assert that every node caches exactly `expected` ParentRepairSessions."""
    msg = "The number of cached ParentRepairSessions should be {} but was {}. " \
          "This may mean that PRS objects are leaking on node {}. Check " \
          "ActiveRepairService for PRS clean up code."
    for node in nodes:
        with JolokiaAgent(node) as jmx:
            actual = jmx.execute_method("org.apache.cassandra.db:type=RepairService",
                                        "parentRepairSessionsCount")
            assert expected == actual, msg.format(expected, actual, node.name)
def __init__(self, node, keyspace, table):
    """Capture the table-level metric MBeans for `keyspace.table` on `node`."""
    assert isinstance(node, Node)
    self.jmx = JolokiaAgent(node)

    def _mbean(metric_name):
        # All three counters share the same per-table metric scope.
        return make_mbean("metrics", type="Table", name=metric_name,
                          keyspace=keyspace, scope=table)

    self.write_latency_mbean = _mbean("WriteLatency")
    self.speculative_reads_mbean = _mbean("SpeculativeRetries")
    self.transient_writes_mbean = _mbean("TransientWrites")
def test_repaired_tracking_with_mismatching_replicas(self):
    """
    verify that when replicas have different repaired sets, this can be detected via the digests
    computed at read time. All nodes start with the same data, but only 1 replica's sstables
    are marked repaired. Then a divergence is introduced by overwriting on 1 replica only,
    which is required to trigger a digest mismatch & full data read (for single partition reads).
    As the repaired sets are different between the replicas, but no other shortcutting occurs
    (no partition tombstones or sstable skipping) and no sstables are involved in pending repair
    session, we expect confirmed inconsistencies to be reported.
    there are two variants of this, for single partition slice & names reads and range reads
    @jira_ticket CASSANDRA-14145
    """
    session, node1, node2 = self.setup_for_repaired_data_tracking()
    stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, %s, %s)")
    stmt.consistency_level = ConsistencyLevel.ALL

    for i in range(10):
        session.execute(stmt, (i, i, i))
    for node in self.cluster.nodelist():
        node.flush()

    for i in range(10, 20):
        session.execute(stmt, (i, i, i))
    for node in self.cluster.nodelist():
        node.flush()
        self.assertNoRepairedSSTables(node, 'ks')

    # stop node 2 and mark its sstables repaired
    node2.stop(wait_other_notice=True)
    node2.run_sstablerepairedset(keyspace='ks')
    # before restarting node2 overwrite some data on node1 to trigger digest mismatches
    session.execute("insert into ks.tbl (k, c, v) values (5, 5, 55)")
    node2.start(wait_for_binary_proto=True)

    out1 = node1.run_sstablemetadata(keyspace='ks').stdout
    out2 = node2.run_sstablemetadata(keyspace='ks').stdout

    # verify the repaired at times for the sstables on node1/node2:
    # node1 all unrepaired (0), node2 all repaired (> 0)
    assert all(t == 0 for t in [int(x) for x in [y.split(' ')[0] for y in findall('(?<=Repaired at: ).*', out1)]])
    assert all(t > 0 for t in [int(x) for x in [y.split(' ')[0] for y in findall('(?<=Repaired at: ).*', out2)]])

    # we expect inconsistencies due to sstables being marked repaired on one replica only;
    # these are marked confirmed because no repair sessions are pending (see docstring)
    with JolokiaAgent(node1) as jmx:
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5",
                                                 expect_confirmed_inconsistencies=True)
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5",
                                                 expect_confirmed_inconsistencies=True)
        # no digest reads for range queries so read repair metric isn't incremented
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl",
                                                 expect_confirmed_inconsistencies=True,
                                                 expect_read_repair=False)
def test_closing_connections(self):
    """
    @jira_ticket CASSANDRA-6546
    Test CASSANDRA-6546 - do connections get closed when disabling / renabling thrift service?
    """
    cluster = self.cluster
    cluster.set_configuration_options(values={'start_rpc': 'true',
                                              'rpc_server_type': 'hsha',
                                              'rpc_max_threads': 20})

    cluster.populate(1)
    (node1, ) = cluster.nodelist()
    remove_perf_disable_shared_mem(node1)
    cluster.start()

    session = self.patient_cql_connection(node1)
    create_ks(session, 'test', 1)
    session.execute("CREATE TABLE \"CF\" (key text PRIMARY KEY, val text) WITH COMPACT STORAGE;")

    def make_connection():
        # open a raw thrift client against the node's thrift interface
        host, port = node1.network_interfaces['thrift']
        client = get_thrift_client(host, port)
        client.transport.open()
        return client

    pools = []
    connected_thrift_clients = make_mbean('metrics', type='Client', name='connectedThriftClients')
    for i in range(10):
        logger.debug("Creating connection pools..")
        for x in range(3):
            pools.append(make_connection())
        logger.debug("Disabling/Enabling thrift iteration #{i}".format(i=i))
        node1.nodetool('disablethrift')
        node1.nodetool('enablethrift')
        logger.debug("Closing connections from the client side..")
        for client in pools:
            client.transport.close()

        # after each cycle the server-side connected-client metric must be back to 0
        with JolokiaAgent(node1) as jmx:
            num_clients = jmx.read_attribute(connected_thrift_clients, "Value")
            assert int(num_clients) == 0, "There are still open Thrift connections after stopping service " + str(num_clients)
def test_closing_connections(self):
    """
    @jira_ticket CASSANDRA-6546
    Test CASSANDRA-6546 - do connections get closed when disabling / renabling thrift service?
    """
    cluster = self.cluster
    cluster.set_configuration_options(values={'start_rpc': 'true',
                                              'rpc_server_type': 'hsha',
                                              'rpc_max_threads': 20})

    cluster.populate(1)
    (node1, ) = cluster.nodelist()
    remove_perf_disable_shared_mem(node1)
    cluster.start(wait_for_binary_proto=True)

    session = self.patient_cql_connection(node1)
    self.create_ks(session, 'test', 1)
    session.execute("CREATE TABLE \"CF\" (key text PRIMARY KEY, val text) WITH COMPACT STORAGE;")

    def make_connection():
        pool = pycassa.ConnectionPool('test', timeout=None)
        # NOTE(review): `cf` is unused beyond construction — presumably kept for its
        # connection-exercising side effect; confirm before removing
        cf = pycassa.ColumnFamily(pool, 'CF')
        return pool

    pools = []
    connected_thrift_clients = make_mbean('metrics', type='Client', name='connectedThriftClients')
    for i in xrange(10):
        debug("Creating connection pools..")
        for x in xrange(3):
            pools.append(make_connection())
        debug("Disabling/Enabling thrift iteration #{i}".format(i=i))
        node1.nodetool('disablethrift')
        node1.nodetool('enablethrift')
        debug("Closing connections from the client side..")
        for pool in pools:
            pool.dispose()

        # after each cycle the server-side connected-client metric must be back to 0
        with JolokiaAgent(node1) as jmx:
            num_clients = jmx.read_attribute(connected_thrift_clients, "Value")
            self.assertEqual(int(num_clients), 0,
                             "There are still open Thrift connections after stopping service")
def _batchlog_replay_compatibility_test(self, coordinator_idx, current_nodes, previous_version, previous_nodes, protocol_version):
    """
    Write a batch through a coordinator rigged (via byteman) to fail after the
    batchlog write, force batchlog replay on the other nodes, and verify both
    that at least 2 batches were replayed and that every node sees the rows.
    """
    session = self.prepare_mixed(coordinator_idx, current_nodes, previous_version, previous_nodes,
                                 protocol_version=protocol_version, install_byteman=True)
    coordinator = self.cluster.nodelist()[coordinator_idx]

    coordinator.byteman_submit(['./byteman/fail_after_batchlog_write.btm'])
    debug("Injected byteman scripts to enable batchlog replay {}".format(coordinator.name))

    query = """
        BEGIN BATCH
        INSERT INTO users (id, firstname, lastname) VALUES (0, 'Jack', 'Sparrow')
        INSERT INTO users (id, firstname, lastname) VALUES (1, 'Will', 'Turner')
        APPLY BATCH
    """
    session.execute(query)

    total_batches_replayed = 0
    blm = make_mbean('db', type='BatchlogManager')

    # replay on every node except the (deliberately failed) coordinator
    for n in self.cluster.nodelist():
        if n == coordinator:
            continue

        with JolokiaAgent(n) as jmx:
            debug('Forcing batchlog replay for {}'.format(n.name))
            jmx.execute_method(blm, 'forceBatchlogReplay')
            batches_replayed = jmx.read_attribute(blm, 'TotalBatchesReplayed')
            debug('{} batches replayed on node {}'.format(batches_replayed, n.name))
            total_batches_replayed += batches_replayed

    assert_greater_equal(total_batches_replayed, 2)

    # every node must now return both rows of the batch
    for node in self.cluster.nodelist():
        session = self.patient_exclusive_cql_connection(node, protocol_version=protocol_version)
        rows = sorted(session.execute('SELECT id, firstname, lastname FROM ks.users'))
        self.assertEqual([[0, 'Jack', 'Sparrow'], [1, 'Will', 'Turner']],
                         [list(rows[0]), list(rows[1])])
def test_repaired_tracking_with_varying_sstable_sets(self):
    """
    verify that repaired data digests are computed over the merged data for each replica
    and that the particular number of sstables on each doesn't affect the comparisons
    both replicas start with the same repaired set, comprising 2 sstables. node1's is
    then compacted and additional unrepaired data added (which overwrites some in the
    repaired set). We expect the repaired digests to still match as the tracking will
    force all sstables containing the partitions to be read
    there are two variants of this, for single partition slice & names reads and range reads
    @jira_ticket CASSANDRA-14145
    """
    session, node1, node2 = self.setup_for_repaired_data_tracking()
    stmt = SimpleStatement("INSERT INTO ks.tbl (k, c, v) VALUES (%s, %s, %s)")
    stmt.consistency_level = ConsistencyLevel.ALL

    # two flushed sstables per node before repair
    for i in range(10):
        session.execute(stmt, (i, i, i))
    for node in self.cluster.nodelist():
        node.flush()

    for i in range(10, 20):
        session.execute(stmt, (i, i, i))
    for node in self.cluster.nodelist():
        node.flush()
        self.assertNoRepairedSSTables(node, 'ks')

    node1.repair(options=['ks'])
    node2.stop(wait_other_notice=True)

    # overwrite part of the repaired set on node1 only, then compact its sstables
    session.execute("insert into ks.tbl (k, c, v) values (5, 5, 55)")
    session.execute("insert into ks.tbl (k, c, v) values (15, 15, 155)")
    node1.flush()
    node1.compact()
    node1.compact()
    node2.start()

    # we don't expect any inconsistencies as all repaired data is read on both replicas
    with JolokiaAgent(node1) as jmx:
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5")
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl WHERE k = 5 AND c = 5")
        # no digest reads for range queries so read repair metric isn't incremented
        self.query_and_check_repaired_mismatches(jmx, session, "SELECT * FROM ks.tbl",
                                                 expect_read_repair=False)
def table_metric_mbeans_test(self):
    """
    Test some basic table metric mbeans with simple writes.
    """
    cluster = self.cluster
    cluster.populate(3)
    node1, node2, node3 = cluster.nodelist()
    remove_perf_disable_shared_mem(node1)
    cluster.start(wait_for_binary_proto=True)

    version = cluster.version()
    node1.stress(['write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=3)'])

    # per-table metric type was renamed ColumnFamily -> Table after 2.2
    typeName = "ColumnFamily" if version <= '2.2.X' else 'Table'
    debug('Version {} typeName {}'.format(version, typeName))

    # TODO the keyspace and table name are capitalized in 2.0
    memtable_size = make_mbean('metrics', type=typeName, keyspace='keyspace1', scope='standard1',
                               name='AllMemtablesHeapSize')
    disk_size = make_mbean('metrics', type=typeName, keyspace='keyspace1', scope='standard1',
                           name='LiveDiskSpaceUsed')
    sstable_count = make_mbean('metrics', type=typeName, keyspace='keyspace1', scope='standard1',
                               name='LiveSSTableCount')

    with JolokiaAgent(node1) as jmx:
        # memtables hold the stress data before any flush
        mem_size = jmx.read_attribute(memtable_size, "Value")
        self.assertGreater(int(mem_size), 10000)

        # nothing on disk yet
        on_disk_size = jmx.read_attribute(disk_size, "Count")
        self.assertEquals(int(on_disk_size), 0)

        node1.flush()

        # flush moves the data to sstables on disk
        on_disk_size = jmx.read_attribute(disk_size, "Count")
        self.assertGreater(int(on_disk_size), 10000)

        sstables = jmx.read_attribute(sstable_count, "Value")
        self.assertGreaterEqual(int(sstables), 1)
def _deprecated_repair_jmx(self, method, arguments):
    """
    * Launch a two node, two DC cluster
    * Create a keyspace and table
    * Insert some data
    * Call the deprecated repair JMX API based on the arguments passed into this method
    * Check the node log to see if the correct repair was performed based on the jmx args

    Returns a dict of the repair options parsed back out of the node's log line.
    """
    cluster = self.cluster

    logger.debug("Starting cluster..")
    cluster.populate([1, 1])
    node1, node2 = cluster.nodelist()
    cluster.start()
    supports_pull_repair = cluster.version() >= LooseVersion('3.10')

    session = self.patient_cql_connection(node1)
    create_ks(session, 'ks', 2)
    create_cf(session, 'cf', read_repair=0.0, columns={'c1': 'text', 'c2': 'text'})
    insert_c1c2(session, n=1000, consistency=ConsistencyLevel.ALL)

    # Run repair
    mbean = make_mbean('db', 'StorageService')
    with JolokiaAgent(node1) as jmx:
        # assert repair runs and returns valid cmd number
        assert jmx.execute_method(mbean, method, arguments) == 1
    # wait for log to start
    node1.watch_log_for("Starting repair command")
    # get repair parameters from the log
    line = node1.grep_log((r"Starting repair command #1" + (r" \([^\)]+\)" if cluster.version() >= LooseVersion("3.10") else "") +
                           r", repairing keyspace ks with repair options \(parallelism: (?P<parallelism>\w+), primary range: (?P<pr>\w+), "
                           r"incremental: (?P<incremental>\w+), job threads: (?P<jobs>\d+), ColumnFamilies: (?P<cfs>.+), dataCenters: (?P<dc>.+), "
                           r"hosts: (?P<hosts>.+), # of ranges: (?P<ranges>\d+)(, pull repair: (?P<pullrepair>true|false))?(, ignore unreplicated keyspaces: (?P<ignoreunrepl>true|false))?\)"))

    assert_length_equal(line, 1)
    line, m = line[0]

    if supports_pull_repair:
        # BUGFIX: previously `assert m.group("pullrepair"), "false" == "..."` which
        # always passed — the equality check sat in the assert's message slot.
        assert m.group("pullrepair") == "false", \
            "Pull repair cannot be enabled through the deprecated API so the pull repair option should always be false."

    return {"parallelism": m.group("parallelism"),
            "primary_range": m.group("pr"),
            "incremental": m.group("incremental"),
            "job_threads": m.group("jobs"),
            "column_families": m.group("cfs"),
            "data_centers": m.group("dc"),
            "hosts": m.group("hosts"),
            "ranges": m.group("ranges")}
def test_table_metric_mbeans(self):
    """
    Test some basic table metric mbeans with simple writes.
    """
    cluster = self.cluster
    cluster.populate(3)
    node1, node2, node3 = cluster.nodelist()
    cluster.start()

    version = cluster.version()
    node1.stress(['write', 'n=10K', 'no-warmup', '-schema', 'replication(factor=3)'])

    # per-table metric type was renamed ColumnFamily -> Table in 3.0
    typeName = "ColumnFamily" if version < '3.0' else 'Table'
    logger.debug('Version {} typeName {}'.format(version, typeName))

    # TODO the keyspace and table name are capitalized in 2.0
    memtable_size = make_mbean('metrics', type=typeName, keyspace='keyspace1', scope='standard1',
                               name='AllMemtablesHeapSize')
    disk_size = make_mbean('metrics', type=typeName, keyspace='keyspace1', scope='standard1',
                           name='LiveDiskSpaceUsed')
    sstable_count = make_mbean('metrics', type=typeName, keyspace='keyspace1', scope='standard1',
                               name='LiveSSTableCount')

    with JolokiaAgent(node1) as jmx:
        # memtables hold the stress data before any flush
        mem_size = jmx.read_attribute(memtable_size, "Value")
        assert int(mem_size) > 10000

        # nothing on disk yet
        on_disk_size = jmx.read_attribute(disk_size, "Count")
        assert int(on_disk_size) == 0

        node1.flush()

        # flush moves the data to sstables on disk
        on_disk_size = jmx.read_attribute(disk_size, "Count")
        assert int(on_disk_size) > 10000

        sstables = jmx.read_attribute(sstable_count, "Value")
        assert int(sstables) >= 1
def test_set_get_batchlog_replay_throttle(self):
    """
    @jira_ticket CASSANDRA-13614

    Test that batchlog replay throttle can be set and get through JMX
    """
    cluster = self.cluster
    cluster.populate(2)
    node = cluster.nodelist()[0]
    cluster.start()

    mbean = make_mbean('db', 'StorageService')
    # Set and get throttle with JMX, ensuring that the rate change is logged
    with JolokiaAgent(node) as jmx:
        jmx.write_attribute(mbean, 'BatchlogReplayThrottleInKB', 4096)
        matches = node.grep_log('Updating batchlog replay throttle to 4096 KB/s, 2048 KB/s per endpoint',
                                filename='debug.log')
        assert len(matches) > 0
        assert 4096 == jmx.read_attribute(mbean, 'BatchlogReplayThrottleInKB')
class StorageProxy(object):
    """JMX helper exposing read-repair metric counters for a single node."""

    def __init__(self, node):
        assert isinstance(node, Node)
        self.node = node
        self.jmx = JolokiaAgent(node)

    def start(self):
        """Start the backing Jolokia agent."""
        self.jmx.start()

    def stop(self):
        """Stop the backing Jolokia agent."""
        self.jmx.stop()

    def _get_metric(self, metric):
        # Read-repair metrics expose their value via the "Count" attribute.
        return self.jmx.read_attribute(
            make_mbean("metrics", type="ReadRepair", name=metric), "Count")

    @property
    def blocking_read_repair(self):
        return self._get_metric("RepairedBlocking")

    @property
    def speculated_rr_read(self):
        return self._get_metric("SpeculatedRead")

    @property
    def speculated_rr_write(self):
        return self._get_metric("SpeculatedWrite")

    def get_table_metric(self, keyspace, table, metric, attr="Count"):
        """Read `attr` of the named per-table metric for `keyspace.table`."""
        mbean = make_mbean("metrics", keyspace=keyspace, scope=table,
                           type="Table", name=metric)
        return self.jmx.read_attribute(mbean, attr)

    def __enter__(self):
        """ For contextmanager-style usage. """
        self.start()
        return self

    def __exit__(self, exc_type, value, traceback):
        """ For contextmanager-style usage. """
        self.stop()
def test_bloom_filter_false_ratio(self):
    """
    Test for CASSANDRA-15834

    Verifies if BloomFilterFalseRatio takes into account true negatives. Without this fix,
    the following scenario (many reads for non-existing rows) would yield
    BloomFilterFalseRatio=1.0. With the fix we assume it should be less than the default
    bloom_filter_fp_chance.
    """
    cluster = self.cluster
    cluster.populate(1)
    node = cluster.nodelist()[0]
    cluster.start(wait_for_binary_proto=True)

    session = self.patient_exclusive_cql_connection(node)
    keyspace = 'bloom_ratio_test_ks'
    create_ks(session, keyspace, 1)
    create_c1c2_table(self, session)

    insert_c1c2(session, n=10)
    node.nodetool("flush " + keyspace)

    # read mostly non-existing keys: only 10 of the 10000 exist, so almost every
    # read is a true negative for the bloom filter
    for key in range(10000):
        session.execute("SELECT * from cf where key = '{0}'".format(key))

    # both the global and the per-table variants of the (Recent)BloomFilterFalseRatio metric
    bloom_filter_false_ratios = [
        make_mbean('metrics', type='Table', name='RecentBloomFilterFalseRatio'),
        make_mbean('metrics', type='Table', keyspace=keyspace, scope='cf', name='BloomFilterFalseRatio'),
        make_mbean('metrics', type='Table', name='BloomFilterFalseRatio'),
        make_mbean('metrics', type='Table', keyspace=keyspace, scope='cf', name='RecentBloomFilterFalseRatio'),
    ]

    with JolokiaAgent(node) as jmx:
        for metric in bloom_filter_false_ratios:
            ratio = jmx.read_attribute(metric, "Value")
            # Bloom filter false positive ratio should not be greater than the default bloom_filter_fp_chance.
            assert ratio < 0.01
def mv_metric_mbeans_release_test(self):
    """
    Test that the right mbeans are created and released when creating mvs
    """
    cluster = self.cluster
    cluster.populate(1)
    node = cluster.nodelist()[0]
    remove_perf_disable_shared_mem(node)
    cluster.start(wait_for_binary_proto=True)

    node.run_cqlsh(cmds="""
        CREATE KEYSPACE mvtest WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor': 1 };
        CREATE TABLE mvtest.testtable (
            foo int,
            bar text,
            baz text,
            PRIMARY KEY (foo, bar)
        );

        CREATE MATERIALIZED VIEW mvtest.testmv AS
            SELECT foo, bar, baz FROM mvtest.testtable WHERE
            foo IS NOT NULL AND bar IS NOT NULL AND baz IS NOT NULL
            PRIMARY KEY (foo, bar, baz);""")

    # MBeans for the base table: all of these should exist after creation.
    table_memtable_size = make_mbean('metrics', type='Table', keyspace='mvtest',
                                     scope='testtable', name='AllMemtablesHeapSize')
    table_view_read_time = make_mbean('metrics', type='Table', keyspace='mvtest',
                                      scope='testtable', name='ViewReadTime')
    table_view_lock_time = make_mbean('metrics', type='Table', keyspace='mvtest',
                                      scope='testtable', name='ViewLockAcquireTime')
    # MBeans for the materialized view itself: only the memtable-size one is
    # expected to exist; the view read/lock-time metrics are registered on the
    # base table, not on the view.
    mv_memtable_size = make_mbean('metrics', type='Table', keyspace='mvtest',
                                  scope='testmv', name='AllMemtablesHeapSize')
    mv_view_read_time = make_mbean('metrics', type='Table', keyspace='mvtest',
                                   scope='testmv', name='ViewReadTime')
    mv_view_lock_time = make_mbean('metrics', type='Table', keyspace='mvtest',
                                   scope='testmv', name='ViewLockAcquireTime')

    # Fixed: the original used %s placeholders with str.format(), so the metric
    # and table names were never substituted (and a separator space was missing).
    missing_metric_message = ("Table metric {} should have been registered after creating table {} "
                              "but wasn't!")

    with JolokiaAgent(node) as jmx:
        self.assertIsNotNone(
            jmx.read_attribute(table_memtable_size, "Value"),
            missing_metric_message.format("AllMemtablesHeapSize", "testtable"))
        self.assertIsNotNone(
            jmx.read_attribute(table_view_read_time, "Count"),
            missing_metric_message.format("ViewReadTime", "testtable"))
        self.assertIsNotNone(
            jmx.read_attribute(table_view_lock_time, "Count"),
            missing_metric_message.format("ViewLockAcquireTime", "testtable"))
        self.assertIsNotNone(
            jmx.read_attribute(mv_memtable_size, "Value"),
            missing_metric_message.format("AllMemtablesHeapSize", "testmv"))
        # The per-view read/lock-time mbeans must NOT exist.
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=mv_view_read_time, attribute="Count", verbose=False)
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=mv_view_lock_time, attribute="Count", verbose=False)

    node.run_cqlsh(cmds="DROP KEYSPACE mvtest;")

    # After dropping the keyspace every mbean must have been released.
    with JolokiaAgent(node) as jmx:
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=table_memtable_size, attribute="Value", verbose=False)
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=table_view_lock_time, attribute="Count", verbose=False)
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=table_view_read_time, attribute="Count", verbose=False)
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=mv_memtable_size, attribute="Value", verbose=False)
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=mv_view_lock_time, attribute="Count", verbose=False)
        self.assertRaisesRegexp(Exception, ".*InstanceNotFoundException.*",
                                jmx.read_attribute,
                                mbean=mv_view_read_time, attribute="Count", verbose=False)
def __init__(self, node):
    """Attach a Jolokia JMX agent to *node* (must be a ccm Node)."""
    assert isinstance(node, Node)
    self.node = node
    self.jmx = JolokiaAgent(node)
def test_compactionstats(self):
    """
    @jira_ticket CASSANDRA-10504
    @jira_ticket CASSANDRA-10427

    Test that jmx MBean used by nodetool compactionstats
    properly updates the progress of a compaction
    """
    cluster = self.cluster
    cluster.populate(1)
    node = cluster.nodelist()[0]
    remove_perf_disable_shared_mem(node)
    cluster.start(wait_for_binary_proto=True)

    # Run a quick stress command to create the keyspace and table
    node.stress(['write', 'n=1', 'no-warmup'])
    # Disable compaction on the table
    node.nodetool('disableautocompaction keyspace1 standard1')
    # Throttle compaction so it is slow enough to observe progress between reads.
    node.nodetool('setcompactionthroughput 1')
    node.stress(['write', 'n=150K', 'no-warmup'])
    node.flush()
    # Run a major compaction. This will be the compaction whose
    # progress we track.
    node.nodetool_process('compact')
    # We need to sleep here to give compaction time to start
    # Why not do something smarter? Because if the bug regresses,
    # we can't rely on jmx to tell us that compaction started.
    time.sleep(5)

    compaction_manager = make_mbean('db', type='CompactionManager')
    with JolokiaAgent(node) as jmx:
        progress_string = jmx.read_attribute(compaction_manager, 'CompactionSummary')[0]

        # Pause in between reads
        # to allow compaction to move forward
        time.sleep(2)

        updated_progress_string = jmx.read_attribute(compaction_manager, 'CompactionSummary')[0]
        # Extract the bytes-compacted-so-far figure from the summary string,
        # e.g. "Compaction@<uuid>(keyspace1, standard1, <progress>/<total>)bytes".
        var = 'Compaction@{uuid}(keyspace1, standard1, {progress}/{total})bytes'
        progress = int(parse.search(var, progress_string).named['progress'])
        updated_progress = int(parse.search(var, updated_progress_string).named['progress'])

        debug(progress_string)
        debug(updated_progress_string)

        # We want to make sure that the progress is increasing,
        # and that values other than zero are displayed.
        self.assertGreater(updated_progress, progress)
        self.assertGreaterEqual(progress, 0)
        self.assertGreater(updated_progress, 0)

        # Block until the major compaction is complete
        # Otherwise nodetool will throw an exception
        # Give a timeout, in case compaction is broken
        # and never ends.
        start = time.time()
        max_query_timeout = 600
        debug("Waiting for compaction to finish:")
        while (len(jmx.read_attribute(compaction_manager, 'CompactionSummary')) > 0) and (time.time() - start < max_query_timeout):
            debug(jmx.read_attribute(compaction_manager, 'CompactionSummary'))
            time.sleep(2)
def commitlog_size(node):
    """Return the node's TotalCommitLogSize metric value, read via JMX."""
    with JolokiaAgent(node) as jmx:
        return jmx.read_attribute(
            make_mbean('metrics', type='CommitLog', name='TotalCommitLogSize'),
            'Value')
def __init__(self, node):
    """Attach a Jolokia JMX agent to *node* and cache the StorageProxy mbean name."""
    assert isinstance(node, Node)
    self.node = node
    self.jmx = JolokiaAgent(node)
    self.mbean = make_mbean("db", type="StorageProxy")
def preview_failure_count(node):
    """Return the node's repair PreviewFailures counter, read via JMX."""
    with JolokiaAgent(node) as jmx:
        return jmx.read_attribute(
            make_mbean('metrics', type='Repair', name='PreviewFailures'),
            'Count')