def test_filter(self):
    log_content_before = self.get_event_logs()

    enospc_line = "[99.80.124.204] [stdout] Mar 31 09:08:10 warning| [shard 8] commitlog - Exception in segment reservation: storage_io_error (Storage I/O error: 28: No space left on device)"
    enospc_line_2 = "2019-10-29T12:19:49+00:00 ip-172-30-0-184 !WARNING | scylla: [shard 2] storage_service - " \
                    "Commitlog error: std::filesystem::__cxx11::filesystem_error (error system:28, filesystem error: open failed: No space left on device [/var/lib/scylla/hints/2/172.30.0.116/HintsLog-1-36028797019122576.log])"

    with DbEventsFilter(type='NO_SPACE_ERROR'), \
            DbEventsFilter(type='BACKTRACE', line='No space left on device'), \
            DbEventsFilter(type='DATABASE_ERROR', line='No space left on device'), \
            DbEventsFilter(type='FILESYSTEM_ERROR', line='No space left on device'):

        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)
        DatabaseLogEvent(type="FILESYSTEM_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                  line=enospc_line_2)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)

    log_content_after = self.get_event_logs()
    self.assertEqual(log_content_before, log_content_after)
def test_filter_repair(self):
    log_content_before = self.get_event_logs()

    failed_repaired_line = '2019-07-28T10:53:29+00:00 ip-10-0-167-91 !INFO | scylla.bin: [shard 0] repair - Got error in row level repair: std::runtime_error (repair id 1 is aborted on shard 0)'

    with DbEventsFilter(type='DATABASE_ERROR', line="repair's stream failed: streaming::stream_exception"), \
            DbEventsFilter(type='RUNTIME_ERROR', line='Can not find stream_manager'), \
            DbEventsFilter(type='RUNTIME_ERROR', line='is aborted'):

        DatabaseLogEvent(type="RUNTIME_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                               line=failed_repaired_line)
        DatabaseLogEvent(type="RUNTIME_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                               line=failed_repaired_line)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="B", line_number=22,
                                                                                line="not filtered")

    self.wait_for_event_log_change(log_content_before)
    log_content_after = self.get_event_logs()
    self.assertIn("not filtered", log_content_after)
    self.assertNotIn("repair id 1", log_content_after)
def restart(self):
    # We differentiate between "Restart" and "Reboot".
    # Restart in AWS will be a Stop and Start of an instance.
    # When using storage-optimized instances like i2 or i3, the data on disk is deleted upon STOP.
    # Therefore, we need to set up the instance and treat it as a new instance.
    if self._instance.spot_instance_request_id:
        LOGGER.debug("target node is spot instance, impossible to stop this instance, skipping the restart")
        return

    event_filters = ()
    if any(ss in self._instance.instance_type for ss in ['i3', 'i2']):
        # since there's no disk yet in those types, lots of the errors here are acceptable, and we'll ignore them
        event_filters = DbEventsFilter(type="DATABASE_ERROR", node=self), \
            DbEventsFilter(type="SCHEMA_FAILURE", node=self), \
            DbEventsFilter(type="NO_SPACE_ERROR", node=self), \
            DbEventsFilter(type="FILESYSTEM_ERROR", node=self), \
            DbEventsFilter(type="RUNTIME_ERROR", node=self)

        clean_script = dedent("""
            sudo sed -e '/.*scylla/s/^/#/g' -i /etc/fstab
            sudo sed -e '/auto_bootstrap:.*/s/False/True/g' -i /etc/scylla/scylla.yaml
        """)
        self.remoter.run("sudo bash -cxe '%s'" % clean_script)

        output = self.remoter.run('sudo grep replace_address: /etc/scylla/scylla.yaml', ignore_status=True)
        if 'replace_address_first_boot:' not in output.stdout:
            self.remoter.run('echo replace_address_first_boot: %s |sudo tee --append /etc/scylla/scylla.yaml' %
                             self._instance.private_ip_address)

    self._instance.stop()
    self._instance_wait_safe(self._instance.wait_until_stopped)
    self._instance.start()
    self._instance_wait_safe(self._instance.wait_until_running)
    self._wait_public_ip()
    self.log.debug('Got new public IP %s', self._instance.public_ip_address)
    self.remoter.hostname = self.external_address
    self.wait_ssh_up()

    if any(ss in self._instance.instance_type for ss in ['i3', 'i2']):
        try:
            self.stop_scylla_server(verify_down=False)

            # the scylla_create_devices has been moved to the '/opt/scylladb' folder in the master branch
            for create_devices_file in ['/usr/lib/scylla/scylla-ami/scylla_create_devices',
                                        '/opt/scylladb/scylla-ami/scylla_create_devices',
                                        '/opt/scylladb/scylla-machine-image/scylla_create_devices']:
                result = self.remoter.run('sudo test -e %s' % create_devices_file, ignore_status=True)
                if result.exit_status == 0:
                    self.remoter.run('sudo %s' % create_devices_file)
                    break
            else:
                raise IOError('scylla_create_devices file isn\'t found')

            self.start_scylla_server(verify_up=False)

            self.remoter.run(
                'sudo sed -i -e "s/^replace_address_first_boot:/# replace_address_first_boot:/g" /etc/scylla/scylla.yaml')
            self.remoter.run("sudo sed -e '/auto_bootstrap:.*/s/True/False/g' -i /etc/scylla/scylla.yaml")
        finally:
            if event_filters:
                for event_filter in event_filters:
                    event_filter.cancel_filter()
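# Illustrative sketch, not the method above: since DbEventsFilter is used as a context manager
# elsewhere in this file, the tuple-of-filters plus manual cancel_filter() cleanup in restart()
# could also be expressed with contextlib.ExitStack. The helper name `_i3_restart_filters` is
# hypothetical and shown only for comparison of the two styles.
from contextlib import ExitStack


def _i3_restart_filters(node):
    """Enter all restart-related event filters for a storage-optimized node and return the stack."""
    stack = ExitStack()
    for event_type in ("DATABASE_ERROR", "SCHEMA_FAILURE", "NO_SPACE_ERROR",
                       "FILESYSTEM_ERROR", "RUNTIME_ERROR"):
        # each filter is registered on the stack; closing the stack exits (cancels) all of them
        stack.enter_context(DbEventsFilter(type=event_type, node=node))
    return stack

# Possible usage inside restart(): `with _i3_restart_filters(self): ...` -- the filters are then
# released automatically when the block exits, instead of in an explicit try/finally.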
def test_failed_stall_during_filter(self):  # pylint: disable=no-self-use
    with DbEventsFilter(type="NO_SPACE_ERROR"), \
            DbEventsFilter(type='BACKTRACE', line='No space left on device'):

        event = DatabaseLogEvent(type="REACTOR_STALLED", regex="B")
        event.add_info_and_publish(node="A", line_number=22,
                                   line="[99.80.124.204] [stdout] Mar 31 09:08:10 warning| reactor stall 20")
        logging.info(event.severity)
        assert event.severity == Severity.ERROR, event.severity
def test_filter(self):
    enospc_line = ("[99.80.124.204] [stdout] Mar 31 09:08:10 warning| [shard 8] commitlog - "
                   "Exception in segment reservation: storage_io_error "
                   "(Storage I/O error: 28: No space left on device)")

    with DbEventsFilter(type="NO_SPACE_ERROR"), \
            DbEventsFilter(type='BACKTRACE', line='No space left on device'):
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line=enospc_line)
def ignore_upgrade_schema_errors():
    with ExitStack() as stack:
        stack.enter_context(DbEventsFilter(type='DATABASE_ERROR', line='Failed to load schema'))
        stack.enter_context(DbEventsFilter(type='SCHEMA_FAILURE', line='Failed to load schema'))
        stack.enter_context(DbEventsFilter(type='DATABASE_ERROR', line='Failed to pull schema'))
        stack.enter_context(DbEventsFilter(type='RUNTIME_ERROR', line='Failed to load schema'))
        yield
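# Usage sketch (assumption: ignore_upgrade_schema_errors is decorated with
# contextlib.contextmanager -- the bare `yield` above implies it). An upgrade step can then
# suppress the known schema load/pull errors only for its own duration:
#
#     with ignore_upgrade_schema_errors():
#         self.upgrade_node(self.db_cluster.node_to_upgrade)
#
# Leaving the `with` block unwinds the ExitStack, which cancels all four filters at once.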
def restart(self):
    # We differentiate between "Restart" and "Reboot".
    # Restart in AWS will be a Stop and Start of an instance.
    # When using storage-optimized instances like i2 or i3, the data on disk is deleted upon STOP.
    # Therefore, we need to set up the instance and treat it as a new instance.
    if self._instance.spot_instance_request_id:
        logger.debug("target node is spot instance, impossible to stop this instance, skipping the restart")
        return

    event_filters = ()
    if any(ss in self._instance.instance_type for ss in ['i3', 'i2']):
        # since there's no disk yet in those types, lots of the errors here are acceptable, and we'll ignore them
        event_filters = DbEventsFilter(type="DATABASE_ERROR"), DbEventsFilter(type="SCHEMA_FAILURE"), \
            DbEventsFilter(type="NO_SPACE_ERROR"), DbEventsFilter(type="FILESYSTEM_ERROR")

        clean_script = dedent("""
            sudo sed -e '/.*scylla/s/^/#/g' -i /etc/fstab
            sudo sed -e '/auto_bootstrap:.*/s/False/True/g' -i /etc/scylla/scylla.yaml
        """)
        self.remoter.run("sudo bash -cxe '%s'" % clean_script)

        output = self.remoter.run('sudo grep replace_address: /etc/scylla/scylla.yaml', ignore_status=True)
        if 'replace_address_first_boot:' not in output.stdout:
            # NOTE: the shell redirection here runs as the remote user, not as root; the newer
            # version of this method (above) uses 'sudo tee --append' instead.
            self.remoter.run('sudo echo replace_address_first_boot: %s >> /etc/scylla/scylla.yaml' %
                             self._instance.private_ip_address)

    self._instance.stop()
    self._instance_wait_safe(self._instance.wait_until_stopped)
    self._instance.start()
    self._instance_wait_safe(self._instance.wait_until_running)
    self._wait_public_ip()
    self.log.debug('Got new public IP %s', self._instance.public_ip_address)
    self.remoter.hostname = self._instance.public_ip_address
    self.wait_ssh_up()

    if any(ss in self._instance.instance_type for ss in ['i3', 'i2']):
        try:
            self.remoter.run('sudo /usr/lib/scylla/scylla-ami/scylla_create_devices')
            self.stop_scylla_server(verify_down=False)
            self.start_scylla_server(verify_up=False)
        finally:
            if event_filters:
                for event_filter in event_filters:
                    event_filter.cancel_filter()
def test_filter_expiration(self):
    log_content_before = self.get_event_logs()

    line_prefix = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00")

    with DbEventsFilter(type="NO_SPACE_ERROR", node="A"):
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="A").add_info_and_publish(node="A", line_number=22,
                                                                                line=line_prefix + " this is filtered")

    time.sleep(5)

    DatabaseLogEvent(type="NO_SPACE_ERROR", regex="A").add_info_and_publish(node="A", line_number=22,
                                                                            line=line_prefix + " this is filtered")

    time.sleep(5)

    line_prefix = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00")
    print(line_prefix)

    DatabaseLogEvent(type="NO_SPACE_ERROR", regex="A").add_info_and_publish(node="A", line_number=22,
                                                                            line=line_prefix + " not filtered")

    self.wait_for_event_log_change(log_content_before)
    log_content_after = self.get_event_logs()
    self.assertIn("not filtered", log_content_after)
    self.assertNotIn("this is filtered", log_content_after)
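# What the test above exercises, as inferred from its assertions: DbEventsFilter keeps
# suppressing events whose embedded log-line timestamp predates the moment the filter was
# closed, so the second "this is filtered" publish is still swallowed even though it happens
# after the `with` block has exited; only the event re-stamped with a fresh utcnow() prefix
# makes it into the event log.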
def test_filter_by_node(self):
    log_content_before = self.get_event_log_file('events.log')
    with DbEventsFilter(type="NO_SPACE_ERROR", node="A"):
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                                line="this is filtered")
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="B", line_number=22,
                                                                                line="not filtered")

    log_content_after = self.wait_for_event_log_change('events.log', log_content_before)
    self.assertIn("not filtered", log_content_after)
    self.assertNotIn("this is filtered", log_content_after)
def test_filter_upgrade(self):
    log_content_before = self.get_event_log_file('events.log')
    known_failure_line = '!ERR | scylla: [shard 3] storage_proxy - Exception when communicating with 10.142.0.56: std::runtime_error (Failed to load schema version b40e405f-462c-38f2-a90c-6f130ddbf6f3)'
    with DbEventsFilter(type='RUNTIME_ERROR', line='Failed to load schema'):
        DatabaseLogEvent(type="RUNTIME_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                               line=known_failure_line)
        DatabaseLogEvent(type="RUNTIME_ERROR", regex="B").add_info_and_publish(node="A", line_number=22,
                                                                               line=known_failure_line)
        DatabaseLogEvent(type="NO_SPACE_ERROR", regex="B").add_info_and_publish(node="B", line_number=22,
                                                                                line="not filtered")

    log_content_after = self.wait_for_event_log_change('events.log', log_content_before)
    self.assertIn("not filtered", log_content_after)
    self.assertNotIn("Exception when communicating", log_content_after)
def test_rolling_upgrade(self):  # pylint: disable=too-many-locals,too-many-statements
    """
    Upgrade half of the nodes in the cluster and start a special read workload during that stage.
    The checksum method changed to xxhash in Scylla 2.2; we use this case to verify that the
    read (cl=ALL) workload works well. Upgrade all nodes to the new version at the end.
    """
    # If the target version is >= 3.1 we need to perform the truncate-entries test
    target_upgrade_version = self.params.get('target_upgrade_version', default='')
    self.truncate_entries_flag = False
    if target_upgrade_version and parse_version(target_upgrade_version) >= parse_version('3.1') and \
            not is_enterprise(target_upgrade_version):
        self.truncate_entries_flag = True

    with self.subTest('pre-test - prepare test keyspaces and tables'):
        # prepare test keyspaces and tables before the upgrade to avoid schema changes during the mixed cluster.
        self.prepare_keyspaces_and_tables()
        self.fill_and_verify_db_data('BEFORE UPGRADE', pre_fill=True)

        # write workload during the entire test
        self.log.info('Starting c-s write workload during entire test')
        write_stress_during_entire_test = self.params.get('write_stress_during_entire_test')
        entire_write_cs_thread_pool = self.run_stress_thread(stress_cmd=write_stress_during_entire_test)

        # Prepare keyspace and tables for the truncate test
        if self.truncate_entries_flag:
            self.insert_rows = 10
            self.fill_db_data_for_truncate_test(insert_rows=self.insert_rows)

    with self.subTest('pre-test - Run stress workload before upgrade'):
        # complex workload: prepare write
        self.log.info('Starting c-s complex workload (5M) to prepare data')
        stress_cmd_complex_prepare = self.params.get('stress_cmd_complex_prepare')
        complex_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_complex_prepare,
                                                        profile='data_dir/complex_schema.yaml')
        # wait for the complex workload to finish
        self.verify_stress_thread(complex_cs_thread_pool)

        # generate a random order in which to upgrade
        nodes_num = len(self.db_cluster.nodes)
        # prepare an array containing the indexes
        indexes = list(range(nodes_num))
        # shuffle it so we will upgrade the nodes in a random order
        random.shuffle(indexes)

        # prepare write workload
        self.log.info('Starting c-s prepare write workload (n=10000000)')
        prepare_write_stress = self.params.get('prepare_write_stress')
        prepare_write_cs_thread_pool = self.run_stress_thread(stress_cmd=prepare_write_stress)
        self.log.info('Sleeping for 60s to let cassandra-stress start before the upgrade...')
        time.sleep(60)

    with DbEventsFilter(type='DATABASE_ERROR', line='Failed to load schema'), \
            DbEventsFilter(type='SCHEMA_FAILURE', line='Failed to load schema'), \
            DbEventsFilter(type='DATABASE_ERROR', line='Failed to pull schema'), \
            DbEventsFilter(type='RUNTIME_ERROR', line='Failed to load schema'):

        step = 'Step1 - Upgrade First Node '
        with self.subTest(step):
            # upgrade the first node
            self.db_cluster.node_to_upgrade = self.db_cluster.nodes[indexes[0]]
            self.log.info('Upgrade Node %s begin', self.db_cluster.node_to_upgrade.name)
            self.upgrade_node(self.db_cluster.node_to_upgrade)
            self.log.info('Upgrade Node %s ended', self.db_cluster.node_to_upgrade.name)
            self.db_cluster.node_to_upgrade.check_node_health()

            # wait for the prepare write workload to finish
            self.verify_stress_thread(prepare_write_cs_thread_pool)

            # read workload (cl=QUORUM)
            self.log.info('Starting c-s read workload (cl=QUORUM n=10000000)')
            stress_cmd_read_cl_quorum = self.params.get('stress_cmd_read_cl_quorum')
            read_stress_queue = self.run_stress_thread(stress_cmd=stress_cmd_read_cl_quorum)
            # wait for the read workload to finish
            self.verify_stress_thread(read_stress_queue)

            self.fill_and_verify_db_data('after upgraded one node')
            self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade,
                                                          step=step + ' - after upgraded one node')

            # read workload
            self.log.info('Starting c-s read workload for 10m')
            stress_cmd_read_10m = self.params.get('stress_cmd_read_10m')
            read_10m_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_read_10m)

            self.log.info('Sleeping for 60s to let cassandra-stress start before the upgrade...')
            time.sleep(60)

        step = 'Step2 - Upgrade Second Node '
        with self.subTest(step):
            # upgrade the second node
            self.db_cluster.node_to_upgrade = self.db_cluster.nodes[indexes[1]]
            self.log.info('Upgrade Node %s begin', self.db_cluster.node_to_upgrade.name)
            self.upgrade_node(self.db_cluster.node_to_upgrade)
            self.log.info('Upgrade Node %s ended', self.db_cluster.node_to_upgrade.name)
            self.db_cluster.node_to_upgrade.check_node_health()

            # wait for the 10m read workload to finish
            self.verify_stress_thread(read_10m_cs_thread_pool)
            self.fill_and_verify_db_data('after upgraded two nodes')
            self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade,
                                                          step=step + ' - after upgraded two nodes')

            # read workload (60m)
            self.log.info('Starting c-s read workload for 60m')
            stress_cmd_read_60m = self.params.get('stress_cmd_read_60m')
            read_60m_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_read_60m)
            self.log.info('Sleeping for 60s to let cassandra-stress start before the rollback...')
            time.sleep(60)

    with self.subTest('Step3 - Rollback Second Node '):
        # rollback the second node
        self.log.info('Rollback Node %s begin', self.db_cluster.nodes[indexes[1]].name)
        self.rollback_node(self.db_cluster.nodes[indexes[1]])
        self.log.info('Rollback Node %s ended', self.db_cluster.nodes[indexes[1]].name)
        self.db_cluster.nodes[indexes[1]].check_node_health()

    step = 'Step4 - Verify data during mixed cluster mode '
    with self.subTest(step):
        self.fill_and_verify_db_data('after rollback the second node')
        self.log.info('Repair the first upgraded Node')
        self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair')
        self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, step=step)

    with DbEventsFilter(type='DATABASE_ERROR', line='Failed to load schema'), \
            DbEventsFilter(type='SCHEMA_FAILURE', line='Failed to load schema'), \
            DbEventsFilter(type='DATABASE_ERROR', line='Failed to pull schema'), \
            DbEventsFilter(type='RUNTIME_ERROR', line='Failed to load schema'):

        step = 'Step5 - Upgrade rest of the Nodes '
        with self.subTest(step):
            for i in indexes[1:]:
                self.db_cluster.node_to_upgrade = self.db_cluster.nodes[i]
                self.log.info('Upgrade Node %s begin', self.db_cluster.node_to_upgrade.name)
                self.upgrade_node(self.db_cluster.node_to_upgrade)
                self.log.info('Upgrade Node %s ended', self.db_cluster.node_to_upgrade.name)
                self.db_cluster.node_to_upgrade.check_node_health()
                self.fill_and_verify_db_data('after upgraded %s' % self.db_cluster.node_to_upgrade.name)
                self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, step=step)

    with self.subTest('Step6 - Verify stress results after upgrade '):
        self.log.info('Waiting for stress threads to complete after upgrade')
        # wait for the 60m read workload to finish
        self.verify_stress_thread(read_60m_cs_thread_pool)
        self.verify_stress_thread(entire_write_cs_thread_pool)

    with self.subTest('Step7 - Upgrade sstables to latest supported version '):
        # figure out what is the latest supported sstable version
        self.expected_sstable_format_version = self.get_highest_supported_sstable_version()

        # run 'nodetool upgradesstables' on all nodes and check/wait for all files to be upgraded
        upgradesstables = self.db_cluster.run_func_parallel(func=self.upgradesstables_if_command_available)

        # only check the sstable format version if all nodes had 'nodetool upgradesstables' available
        if all(upgradesstables):
            self.log.info('Upgrading sstables if new version is available')
            tables_upgraded = self.db_cluster.run_func_parallel(func=self.wait_for_sstable_upgrade)
            assert all(tables_upgraded), "Failed to upgrade the sstable format {}".format(tables_upgraded)

        # Verify sstabledump
        self.log.info('Starting sstabledump to verify correctness of sstables')
        self.db_cluster.nodes[0].remoter.run(
            'for i in `sudo find /var/lib/scylla/data/keyspace_complex/ -type f |grep -v manifest.json |'
            'grep -v snapshots |head -n 1`; do echo $i; sudo sstabledump $i 1>/tmp/sstabledump.output || '
            'exit 1; done', verbose=True)

    with self.subTest('Step8 - Run stress and verify after upgrading entire cluster '):
        self.log.info('Starting verify_stress_after_cluster_upgrade')
        verify_stress_after_cluster_upgrade = self.params.get(  # pylint: disable=invalid-name
            'verify_stress_after_cluster_upgrade')
        verify_stress_cs_thread_pool = self.run_stress_thread(stress_cmd=verify_stress_after_cluster_upgrade)
        self.verify_stress_thread(verify_stress_cs_thread_pool)

        # complex workload: verify data by simple read cl=ALL
        self.log.info('Starting c-s complex workload to verify data by simple read')
        stress_cmd_complex_verify_read = self.params.get('stress_cmd_complex_verify_read')
        complex_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_complex_verify_read,
                                                        profile='data_dir/complex_schema.yaml')
        # wait for the read complex workload to finish
        self.verify_stress_thread(complex_cs_thread_pool)

        # After adjusting the workloads there is an entire-write workload, and it uses a fixed duration for
        # catching data loss.
        # But the execution time of the workloads is not exact, so let's only use the basic prepare write &
        # read verify for complex workloads, and comment out the two complex workloads below.
        #
        # TODO: retest the commented workloads and decide whether to enable or delete them.
        #
        # complex workload: verify data by multiple ops
        # self.log.info('Starting c-s complex workload to verify data by multiple ops')
        # stress_cmd_complex_verify_more = self.params.get('stress_cmd_complex_verify_more')
        # complex_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_complex_verify_more,
        #                                                 profile='data_dir/complex_schema.yaml')

        # wait for the complex workload to finish
        # self.verify_stress_thread(complex_cs_thread_pool)

        # complex workload: verify data by deleting 1/10 of the data
        # self.log.info('Starting c-s complex workload to verify data by delete')
        # stress_cmd_complex_verify_delete = self.params.get('stress_cmd_complex_verify_delete')
        # complex_cs_thread_pool = self.run_stress_thread(stress_cmd=stress_cmd_complex_verify_delete,
        #                                                 profile='data_dir/complex_schema.yaml')
        # wait for the complex workload to finish
        # self.verify_stress_thread(complex_cs_thread_pool)

    # During the test we filter and ignore some specific errors, but we want to allow only a certain amount of them
    step = 'Step9 - Search for errors that we filter during the test '
    with self.subTest(step):
        self.log.info('Checking how many failed_to_load_schema errors happened during the test')
        error_factor = 3
        schema_load_error_num = 0
        for node in self.db_cluster.nodes:
            errors = node.search_database_log(search_pattern='Failed to load schema version',
                                              start_from_beginning=True, publish_events=False)
            schema_load_error_num += len(errors)
            self.search_for_idx_token_error_after_upgrade(node=node, step=step)

        self.log.info('schema_load_error_num: %d', schema_load_error_num)
        assert schema_load_error_num <= error_factor * 8 * len(self.db_cluster.nodes), \
            'Only allowing shards_num * %d schema load errors per host during the ' \
            'entire test, actual: %d' % (error_factor, schema_load_error_num)

    self.log.info('all nodes were upgraded, and last workaround is verified.')
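# Worked example for the Step9 error budget above (assuming the hard-coded 8 stands for the
# expected number of shards per node, which is not stated in this snippet): with
# error_factor = 3 and a 4-node cluster, the test tolerates at most 3 * 8 * 4 = 96
# "Failed to load schema version" occurrences across all database logs; one more fails the
# assertion.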
def ignore_no_space_errors(node):
    with DbEventsFilter(type='NO_SPACE_ERROR', node=node), \
            DbEventsFilter(type='BACKTRACE', line='No space left on device', node=node), \
            DbEventsFilter(type='DATABASE_ERROR', line='No space left on device', node=node), \
            DbEventsFilter(type='FILESYSTEM_ERROR', line='No space left on device', node=node):
        yield
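# Usage sketch, assuming ignore_no_space_errors() is decorated with contextlib.contextmanager
# (the bare `yield` suggests it): wrap a disk-filling operation on a specific node so that only
# that node's ENOSPC noise is filtered, e.g.
#
#     with ignore_no_space_errors(node=target_node):
#         fill_disk_to_capacity(target_node)   # hypothetical helper, not part of this module
#
# Events from other nodes still surface, mirroring the per-node behavior exercised in
# test_filter_by_node above.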