class ClusterConfigTest(RedpandaTest): def __init__(self, *args, **kwargs): rp_conf = BOOTSTRAP_CONFIG.copy() # Enable our feature flag rp_conf['enable_central_config'] = True super(ClusterConfigTest, self).__init__(*args, extra_rp_conf=rp_conf, **kwargs) self.admin = Admin(self.redpanda) self.rpk = RpkTool(self.redpanda) @cluster(num_nodes=3) def test_get_config(self): """ Verify that the config GET endpoint serves valid json with some options in it. """ admin = Admin(self.redpanda) config = admin.get_cluster_config() # Pick an arbitrary config property to verify that the result # contained some properties assert 'enable_transactions' in config node_config = admin.get_node_config() # Some arbitrary property to check syntax of result assert 'kafka_api' in node_config @cluster(num_nodes=3) def test_bootstrap(self): """ Verify that config settings present in redpanda.cfg are imported on first startup. :return: """ admin = Admin(self.redpanda) config = admin.get_cluster_config() for k, v in BOOTSTRAP_CONFIG.items(): assert config[k] == v set_again = {'enable_idempotence': False} assert BOOTSTRAP_CONFIG['enable_idempotence'] != set_again[ 'enable_idempotence'] self.redpanda.restart_nodes(self.redpanda.nodes, set_again) # Our attempt to set the value differently in the config file after first startup # should have failed: the original config value should still be set. config = admin.get_cluster_config() for k, v in BOOTSTRAP_CONFIG.items(): assert config[k] == v def _wait_for_version_sync(self, version): wait_until( lambda: set([ n['config_version'] for n in self.admin.get_cluster_config_status() ]) == {version}, timeout_sec=10, backoff_sec=0.5, err_msg=f"Config status versions did not converge on {version}") def _check_restart_clears(self): """ After changing a setting with needs_restart=true, check that nodes clear the flag after being restarted. """ status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is True first_node = self.redpanda.nodes[0] other_nodes = self.redpanda.nodes[1:] self.redpanda.restart_nodes(first_node) wait_until(lambda: self.admin.get_cluster_config_status()[0]['restart'] == False, timeout_sec=10, backoff_sec=0.5, err_msg=f"Restart flag did not clear after restart") self.redpanda.restart_nodes(other_nodes) wait_until(lambda: set( [n['restart'] for n in self.admin.get_cluster_config_status()]) == {False}, timeout_sec=10, backoff_sec=0.5, err_msg=f"Not all nodes cleared restart flag") @cluster(num_nodes=3) def test_restart(self): """ Verify that a setting requiring restart is indicated as such in status, and that status is cleared after we restart the node. """ # An arbitrary restart-requiring setting with a non-default value new_setting = ('kafka_qdc_idle_depth', 77) patch_result = self.admin.patch_cluster_config( upsert=dict([new_setting])) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ new_setting[0]] == new_setting[1] # Update of cluster status is not synchronous self._check_restart_clears() # Test that a reset to default triggers the restart flag the same way as # an upsert does patch_result = self.admin.patch_cluster_config(remove=[new_setting[0]]) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ new_setting[0]] != new_setting[1] self._check_restart_clears() def _check_value_everywhere(self, key, expect_value): for node in self.redpanda.nodes: actual_value = self.admin.get_cluster_config(node)[key] if actual_value != expect_value: self.logger.error( f"Wrong value on node {node.account.hostname}: {key}={actual_value} (!={expect_value})" ) assert self.admin.get_cluster_config(node)[key] == expect_value def _check_propagated_and_persistent(self, key, expect_value): """ Verify that a configuration value has successfully propagated to all nodes, and that it persists after a restart. """ self._check_value_everywhere(key, expect_value) self.redpanda.restart_nodes(self.redpanda.nodes) self._check_value_everywhere(key, expect_value) @cluster(num_nodes=3) def test_simple_live_change(self): # An arbitrary non-restart-requiring setting norestart_new_setting = ('log_message_timestamp_type', "LogAppendTime") assert self.admin.get_cluster_config()[ norestart_new_setting[0]] == "CreateTime" # Initially default patch_result = self.admin.patch_cluster_config( upsert=dict([norestart_new_setting])) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ norestart_new_setting[0]] == norestart_new_setting[1] # Status should not indicate restart needed status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False # Setting should be propagated and survive a restart self._check_propagated_and_persistent(norestart_new_setting[0], norestart_new_setting[1]) @cluster(num_nodes=3) def test_invalid_settings(self): default_value = "CreateTime" invalid_setting = ('log_message_timestamp_type', "rhubarb") assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value patch_result = self.admin.patch_cluster_config( upsert=dict([invalid_setting])) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value # Status should not indicate restart needed status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [invalid_setting[0]] # List of invalid properties in node status should not clear on restart. self.redpanda.restart_nodes(self.redpanda.nodes) # We have to sleep here because in the success case there is no status update # being sent: it's a no-op after node startup when they realize their config # status is the same as the one already reported. time.sleep(10) status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [invalid_setting[0]] # Reset the properties, check that it disappears from the list of invalid settings patch_result = self.admin.patch_cluster_config( remove=[invalid_setting[0]]) self._wait_for_version_sync(patch_result['config_version']) assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [] # TODO once API frontend does validation, this test will need a force # flag to the API to get the invalid value past the frontend and # to the nodes where it will show up in status. That force flag # will also be important IRL if we want to enable e.g. pre-setting a config # for a future redpanda version before installing the new version. # TODO as well as specific invalid examples, do a pass across the whole # schema to check that pass @cluster(num_nodes=3) def test_bad_requests(self): """ Verify that syntactically malformed configuration requests result in proper 400 responses (rather than 500s or crashes) """ for content_type, body in [ ('text/html', ""), # Wrong type, empty ('text/html', "garbage"), # Wrong type, nonempty ('application/json', ""), # Empty ('application/json', "garbage"), # Not JSON ('application/json', "{\"a\": 123}"), # Wrong top level attributes ('application/json', "{\"upsert\": []}"), # Wrong type of 'upsert' ]: try: self.logger.info(f"Checking {content_type}, {body}") self.admin._request("PUT", "cluster_config", node=self.redpanda.nodes[0], headers={'content-type': content_type}, data=body) except requests.exceptions.HTTPError as e: assert e.response.status_code == 400 else: # Should not succeed! assert False @cluster(num_nodes=3) def test_valid_settings(self): # TODO pass @cluster(num_nodes=3) def test_valid_settings(self): """ Bulk exercise of all config settings & the schema endpoint: - for all properties in the schema, set them with a valid non-default value - check the new values are reflected in config GET - restart all nodes (prompt a reload from cache file) - check the new values are reflected in config GET This is not just checking the central config infrastructure: it's also validating that all the property types are outputting the same format as their input (e.g. they have proper rjson_serialize implementations) """ schema_properties = self.admin.get_cluster_config_schema( )['properties'] updates = {} properties_require_restart = False # Don't change these settings, they prevent the test from subsequently # using the cluster exclude_settings = {'enable_sasl', 'enable_admin_api'} initial_config = self.admin.get_cluster_config() for name, p in schema_properties.items(): if name in exclude_settings: continue properties_require_restart |= p['needs_restart'] initial_value = initial_config[name] if 'example' in p: valid_value = p['example'] elif p['type'] == 'integer': if initial_value: valid_value = initial_value * 2 else: valid_value = 100 elif p['type'] == 'number': if initial_value: valid_value = float(initial_value * 2) else: valid_value = 1000.0 elif p['type'] == 'string': valid_value = "rhubarb" elif p['type'] == 'boolean': valid_value = not initial_config[name] elif p['type'] == "array" and p['items']['type'] == 'string': valid_value = ["custard", "cream"] else: raise NotImplementedError(p['type']) updates[name] = valid_value patch_result = self.admin.patch_cluster_config(upsert=updates, remove=[]) self._wait_for_version_sync(patch_result['config_version']) def check_status(expect_restart): # Use one node's status, they should be symmetric status = self.admin.get_cluster_config_status()[0] self.logger.info(f"Status: {json.dumps(status, indent=2)}") assert status['invalid'] == [] assert status['restart'] is expect_restart def check_values(): read_back = self.admin.get_cluster_config() mismatch = [] for k, expect in updates.items(): actual = read_back.get(k, None) # String-ized comparison, because the example values are strings, # whereas by the time we read them back they're properly typed. if str(actual) != str(expect): self.logger.error( f"Config set failed ({k}) {actual}!={expect}") mismatch.append((k, actual, expect)) assert len(mismatch) == 0 check_status(properties_require_restart) check_values() self.redpanda.restart_nodes(self.redpanda.nodes) # We have to sleep here because in the success case there is no status update # being sent: it's a no-op after node startup when they realize their config # status is the same as the one already reported. time.sleep(10) # Check after restart that confuration persisted and status shows valid check_status(False) check_values() def _export(self, all): with tempfile.NamedTemporaryFile('r') as file: self.rpk.cluster_config_export(file.name, all) return file.read() def _import(self, text, all, allow_noop=False): with tempfile.NamedTemporaryFile('w') as file: file.write(text) file.flush() import_stdout = self.rpk.cluster_config_import(file.name, all) last_line = import_stdout.strip().split("\n")[-1] m = re.match("^.+new config version (\d+).*$", last_line) self.logger.debug(f"_import status: {last_line}") if m is None and allow_noop: return None elif m is None: assert m is not None version = int(m.group(1)) return version def _export_import_modify(self, before, after, all=False): text = self._export(all) # Validate that RPK gives us valid yaml _ = yaml.load(text) self.logger.debug(f"Replacing \"{before}\" with \"{after}\"") self.logger.debug(f"Exported config before modification: {text}") # Intentionally not passing this through a YAML deserialize/serialize # step during edit, to more realistically emulate someone hand editing text = text.replace(before, after) self.logger.debug(f"Exported config after modification: {text}") # Edit a setting, import the resulting document version = self._import(text, all) return version, text @cluster(num_nodes=3) def test_rpk_export_import(self): """ Test `rpk cluster config [export|import]` and implicitly also `edit` (which is just an export/import cycle with a text editor run in the middle) """ # An arbitrary tunable for checking --all tunable_property = 'kafka_qdc_depth_alpha' # RPK should give us a valid yaml document version_a, text = self._export_import_modify("kafka_qdc_enable: false", "kafka_qdc_enable: true") self._wait_for_version_sync(version_a) # Default should not have included tunables assert tunable_property not in text # The setting we edited should be updated self._check_value_everywhere("kafka_qdc_enable", True) # Clear a setting, it should revert to its default version_b, text = self._export_import_modify("kafka_qdc_enable: true", "") assert version_b > version_a self._wait_for_version_sync(version_b) self._check_value_everywhere("kafka_qdc_enable", False) # Check that an --all export includes tunables text_all = self._export(all=True) assert tunable_property in text_all # Check that editing a tunable with --all works version_c, text = self._export_import_modify( "kafka_qdc_depth_alpha: 0.8", "kafka_qdc_depth_alpha: 1.5", all=True) assert version_c > version_b self._wait_for_version_sync(version_c) self._check_value_everywhere("kafka_qdc_depth_alpha", 1.5) # Check that clearing a tunable with --all works version_d, text = self._export_import_modify( "kafka_qdc_depth_alpha: 1.5", "", all=True) assert version_d > version_c self._wait_for_version_sync(version_d) self._check_value_everywhere("kafka_qdc_depth_alpha", 0.8) # Check that an import/export with no edits does nothing. text = self._export(all=True) noop_version = self._import(text, allow_noop=True, all=True) assert noop_version is None @cluster(num_nodes=3) def test_rpk_edit_string(self): """ Test import/export of string fields, make sure they don't end up with extraneous quotes """ version_a, text = self._export_import_modify( "cloud_storage_access_key:\n", "cloud_storage_access_key: foobar\n") self._wait_for_version_sync(version_a) self._check_value_everywhere("cloud_storage_access_key", "foobar") version_b, text = self._export_import_modify( "cloud_storage_access_key: foobar\n", "cloud_storage_access_key: \"foobaz\"") self._wait_for_version_sync(version_b) self._check_value_everywhere("cloud_storage_access_key", "foobaz") @cluster(num_nodes=3) def test_rpk_status(self): """ This command is a thin wrapper over the status API that is covered more comprehensively in other tests: this case is just a superficial test that the command succeeds and returns info for each node. """ status_text = self.rpk.cluster_config_status() # Split into lines, skip first one (header) lines = status_text.strip().split("\n")[1:] # Example: # NODE CONFIG_VERSION NEEDS_RESTART INVALID UNKNOWN # 0 17 false [] [] assert len(lines) == len(self.redpanda.nodes) for i, l in enumerate(lines): m = re.match( "^(\d+)\s+(\d+)\s+(true|false)\s+\[(.*)\]\s+\[(.*)\]$", l) assert m is not None node_id, config_version, needs_restart, invalid, unknown = m.groups( ) node = self.redpanda.nodes[i] assert int(node_id) == self.redpanda.idx(node)
class ClusterConfigTest(RedpandaTest): def __init__(self, *args, **kwargs): rp_conf = BOOTSTRAP_CONFIG.copy() # Enable our feature flag rp_conf['enable_central_config'] = True super(ClusterConfigTest, self).__init__(*args, extra_rp_conf=rp_conf, **kwargs) self.admin = Admin(self.redpanda) self.rpk = RpkTool(self.redpanda) @cluster(num_nodes=3) def test_get_config(self): """ Verify that the config GET endpoint serves valid json with some options in it. """ admin = Admin(self.redpanda) config = admin.get_cluster_config() # Pick an arbitrary config property to verify that the result # contained some properties assert 'enable_transactions' in config node_config = admin.get_node_config() # Some arbitrary property to check syntax of result assert 'kafka_api' in node_config @cluster(num_nodes=3) def test_bootstrap(self): """ Verify that config settings present in redpanda.cfg are imported on first startup. :return: """ admin = Admin(self.redpanda) config = admin.get_cluster_config() for k, v in BOOTSTRAP_CONFIG.items(): assert config[k] == v set_again = {'enable_idempotence': False} assert BOOTSTRAP_CONFIG['enable_idempotence'] != set_again[ 'enable_idempotence'] self.redpanda.restart_nodes(self.redpanda.nodes, set_again) # Our attempt to set the value differently in the config file after first startup # should have failed: the original config value should still be set. config = admin.get_cluster_config() for k, v in BOOTSTRAP_CONFIG.items(): assert config[k] == v def _wait_for_version_sync(self, version): wait_until( lambda: set([ n['config_version'] for n in self.admin.get_cluster_config_status() ]) == {version}, timeout_sec=10, backoff_sec=0.5, err_msg=f"Config status versions did not converge on {version}") def _check_restart_clears(self): """ After changing a setting with needs_restart=true, check that nodes clear the flag after being restarted. """ status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is True first_node = self.redpanda.nodes[0] other_nodes = self.redpanda.nodes[1:] self.redpanda.restart_nodes(first_node) wait_until(lambda: self.admin.get_cluster_config_status()[0]['restart'] == False, timeout_sec=10, backoff_sec=0.5, err_msg=f"Restart flag did not clear after restart") self.redpanda.restart_nodes(other_nodes) wait_until(lambda: set( [n['restart'] for n in self.admin.get_cluster_config_status()]) == {False}, timeout_sec=10, backoff_sec=0.5, err_msg=f"Not all nodes cleared restart flag") @cluster(num_nodes=3) def test_restart(self): """ Verify that a setting requiring restart is indicated as such in status, and that status is cleared after we restart the node. """ # An arbitrary restart-requiring setting with a non-default value new_setting = ('kafka_qdc_idle_depth', 77) patch_result = self.admin.patch_cluster_config( upsert=dict([new_setting])) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ new_setting[0]] == new_setting[1] # Update of cluster status is not synchronous self._check_restart_clears() # Test that a reset to default triggers the restart flag the same way as # an upsert does patch_result = self.admin.patch_cluster_config(remove=[new_setting[0]]) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ new_setting[0]] != new_setting[1] self._check_restart_clears() @cluster(num_nodes=3) def test_multistring_restart(self): """ Reproduce an issue where the key we edit is saved correctly, but other cached keys are getting extra-quoted. """ # Initially set both values together patch_result = self.admin.patch_cluster_config( upsert={ "cloud_storage_access_key": "user", "cloud_storage_secret_key": "pass" }) self._wait_for_version_sync(patch_result['config_version']) self._check_value_everywhere("cloud_storage_access_key", "user") self._check_value_everywhere("cloud_storage_secret_key", "pass") # Check initially set values survive a restart self.redpanda.restart_nodes(self.redpanda.nodes) self._check_value_everywhere("cloud_storage_access_key", "user") self._check_value_everywhere("cloud_storage_secret_key", "pass") # Set just one of the values patch_result = self.admin.patch_cluster_config( upsert={"cloud_storage_access_key": "user2"}) self._wait_for_version_sync(patch_result['config_version']) self._check_value_everywhere("cloud_storage_access_key", "user2") self._check_value_everywhere("cloud_storage_secret_key", "pass") # Check that the recently set value persists, AND the originally # set value of another property is not corrupted. self.redpanda.restart_nodes(self.redpanda.nodes) self._check_value_everywhere("cloud_storage_access_key", "user2") self._check_value_everywhere("cloud_storage_secret_key", "pass") def _check_value_everywhere(self, key, expect_value): for node in self.redpanda.nodes: actual_value = self.admin.get_cluster_config(node)[key] if actual_value != expect_value: self.logger.error( f"Wrong value on node {node.account.hostname}: {key}={actual_value} (!={expect_value})" ) assert self.admin.get_cluster_config(node)[key] == expect_value def _check_propagated_and_persistent(self, key, expect_value): """ Verify that a configuration value has successfully propagated to all nodes, and that it persists after a restart. """ self._check_value_everywhere(key, expect_value) self.redpanda.restart_nodes(self.redpanda.nodes) self._check_value_everywhere(key, expect_value) @cluster(num_nodes=3) def test_simple_live_change(self): # An arbitrary non-restart-requiring setting norestart_new_setting = ('log_message_timestamp_type', "LogAppendTime") assert self.admin.get_cluster_config()[ norestart_new_setting[0]] == "CreateTime" # Initially default patch_result = self.admin.patch_cluster_config( upsert=dict([norestart_new_setting])) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ norestart_new_setting[0]] == norestart_new_setting[1] # Status should not indicate restart needed status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False # Setting should be propagated and survive a restart self._check_propagated_and_persistent(norestart_new_setting[0], norestart_new_setting[1]) @cluster(num_nodes=3) @parametrize(key='log_message_timestamp_type', value="rhubarb") @parametrize(key='log_message_timestamp_type', value="31415") @parametrize(key='log_message_timestamp_type', value="false") @parametrize(key='kafka_qdc_enable', value="rhubarb") @parametrize(key='kafka_qdc_enable', value="31415") @parametrize(key='metadata_dissemination_retries', value="rhubarb") @parametrize(key='metadata_dissemination_retries', value="false") @parametrize(key='it_does_not_exist', value="123") def test_invalid_settings(self, key, value): """ Test that without force=true, attempts to set invalid property values are rejected with a 400 status. """ try: patch_result = self.admin.patch_cluster_config(upsert={key: value}) except requests.exceptions.HTTPError as e: if e.response.status_code != 400: raise else: raise RuntimeError( f"Expected 400 but got {patch_result} for {key}={value})") @cluster(num_nodes=3) def test_invalid_settings_forced(self): """ Test that if a value makes it past the frontend API validation, it is caught at the point of apply on each node, and fed back in the config_status. """ default_value = "CreateTime" invalid_setting = ('log_message_timestamp_type', "rhubarb") assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value patch_result = self.admin.patch_cluster_config(upsert=dict( [invalid_setting]), force=True) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value # Status should not indicate restart needed status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [invalid_setting[0]] # List of invalid properties in node status should not clear on restart. self.redpanda.restart_nodes(self.redpanda.nodes) # We have to sleep here because in the success case there is no status update # being sent: it's a no-op after node startup when they realize their config # status is the same as the one already reported. time.sleep(10) status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [invalid_setting[0]] # Reset the properties, check that it disappears from the list of invalid settings patch_result = self.admin.patch_cluster_config( remove=[invalid_setting[0]], force=True) self._wait_for_version_sync(patch_result['config_version']) assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [] # TODO as well as specific invalid examples, do a pass across the whole # schema to check that pass @cluster(num_nodes=3) def test_bad_requests(self): """ Verify that syntactically malformed configuration requests result in proper 400 responses (rather than 500s or crashes) """ for content_type, body in [ ('text/html', ""), # Wrong type, empty ('text/html', "garbage"), # Wrong type, nonempty ('application/json', ""), # Empty ('application/json', "garbage"), # Not JSON ('application/json', "{\"a\": 123}"), # Wrong top level attributes ('application/json', "{\"upsert\": []}"), # Wrong type of 'upsert' ]: try: self.logger.info(f"Checking {content_type}, {body}") self.admin._request("PUT", "cluster_config", node=self.redpanda.nodes[0], headers={'content-type': content_type}, data=body) except requests.exceptions.HTTPError as e: assert e.response.status_code == 400 else: # Should not succeed! assert False @cluster(num_nodes=3) def test_valid_settings(self): """ Bulk exercise of all config settings & the schema endpoint: - for all properties in the schema, set them with a valid non-default value - check the new values are reflected in config GET - restart all nodes (prompt a reload from cache file) - check the new values are reflected in config GET This is not just checking the central config infrastructure: it's also validating that all the property types are outputting the same format as their input (e.g. they have proper rjson_serialize implementations) """ schema_properties = self.admin.get_cluster_config_schema( )['properties'] updates = {} properties_require_restart = False # Don't change these settings, they prevent the test from subsequently # using the cluster exclude_settings = {'enable_sasl', 'enable_admin_api'} initial_config = self.admin.get_cluster_config() for name, p in schema_properties.items(): if name in exclude_settings: continue properties_require_restart |= p['needs_restart'] initial_value = initial_config[name] if 'example' in p: valid_value = p['example'] elif p['type'] == 'integer': if initial_value: valid_value = initial_value * 2 else: valid_value = 100 elif p['type'] == 'number': if initial_value: valid_value = float(initial_value * 2) else: valid_value = 1000.0 elif p['type'] == 'string': valid_value = "rhubarb" elif p['type'] == 'boolean': valid_value = not initial_config[name] elif p['type'] == "array" and p['items']['type'] == 'string': valid_value = ["custard", "cream"] else: raise NotImplementedError(p['type']) updates[name] = valid_value patch_result = self.admin.patch_cluster_config(upsert=updates, remove=[]) self._wait_for_version_sync(patch_result['config_version']) def check_status(expect_restart): # Use one node's status, they should be symmetric status = self.admin.get_cluster_config_status()[0] self.logger.info(f"Status: {json.dumps(status, indent=2)}") assert status['invalid'] == [] assert status['restart'] is expect_restart def check_values(): read_back = self.admin.get_cluster_config() mismatch = [] for k, expect in updates.items(): actual = read_back.get(k, None) # String-ized comparison, because the example values are strings, # whereas by the time we read them back they're properly typed. if str(actual) != str(expect): self.logger.error( f"Config set failed ({k}) {actual}!={expect}") mismatch.append((k, actual, expect)) assert len(mismatch) == 0 check_status(properties_require_restart) check_values() self.redpanda.restart_nodes(self.redpanda.nodes) # We have to sleep here because in the success case there is no status update # being sent: it's a no-op after node startup when they realize their config # status is the same as the one already reported. time.sleep(10) # Check after restart that confuration persisted and status shows valid check_status(False) check_values() def _export(self, all): with tempfile.NamedTemporaryFile('r') as file: self.rpk.cluster_config_export(file.name, all) return file.read() def _import(self, text, all, allow_noop=False): with tempfile.NamedTemporaryFile('w') as file: file.write(text) file.flush() import_stdout = self.rpk.cluster_config_import(file.name, all) last_line = import_stdout.strip().split("\n")[-1] m = re.match(r"^.+new config version (\d+).*$", last_line) self.logger.debug(f"_import status: {last_line}") if m is None and allow_noop: return None assert m is not None, f"Config version not found: {last_line}" version = int(m.group(1)) return version def _export_import_modify(self, before, after, all=False): text = self._export(all) # Validate that RPK gives us valid yaml _ = yaml.load(text) self.logger.debug(f"Replacing \"{before}\" with \"{after}\"") self.logger.debug(f"Exported config before modification: {text}") # Intentionally not passing this through a YAML deserialize/serialize # step during edit, to more realistically emulate someone hand editing text = text.replace(before, after) self.logger.debug(f"Exported config after modification: {text}") # Edit a setting, import the resulting document version = self._import(text, all) return version, text @cluster(num_nodes=3) def test_rpk_export_import(self): """ Test `rpk cluster config [export|import]` and implicitly also `edit` (which is just an export/import cycle with a text editor run in the middle) """ # An arbitrary tunable for checking --all tunable_property = 'kafka_qdc_depth_alpha' # RPK should give us a valid yaml document version_a, text = self._export_import_modify("kafka_qdc_enable: false", "kafka_qdc_enable: true") assert version_a is not None self._wait_for_version_sync(version_a) # Default should not have included tunables assert tunable_property not in text # The setting we edited should be updated self._check_value_everywhere("kafka_qdc_enable", True) # Clear a setting, it should revert to its default version_b, text = self._export_import_modify("kafka_qdc_enable: true", "") assert version_b is not None assert version_b > version_a self._wait_for_version_sync(version_b) self._check_value_everywhere("kafka_qdc_enable", False) # Check that an --all export includes tunables text_all = self._export(all=True) assert tunable_property in text_all # Check that editing a tunable with --all works version_c, text = self._export_import_modify( "kafka_qdc_depth_alpha: 0.8", "kafka_qdc_depth_alpha: 1.5", all=True) assert version_c is not None assert version_c > version_b self._wait_for_version_sync(version_c) self._check_value_everywhere("kafka_qdc_depth_alpha", 1.5) # Check that clearing a tunable with --all works version_d, text = self._export_import_modify( "kafka_qdc_depth_alpha: 1.5", "", all=True) assert version_d is not None assert version_d > version_c self._wait_for_version_sync(version_d) self._check_value_everywhere("kafka_qdc_depth_alpha", 0.8) # Check that an import/export with no edits does nothing. text = self._export(all=True) noop_version = self._import(text, allow_noop=True, all=True) assert noop_version is None @cluster(num_nodes=3) def test_rpk_edit_string(self): """ Test import/export of string fields, make sure they don't end up with extraneous quotes """ version_a, _ = self._export_import_modify( "cloud_storage_access_key:\n", "cloud_storage_access_key: foobar\n") self._wait_for_version_sync(version_a) self._check_value_everywhere("cloud_storage_access_key", "foobar") version_b, _ = self._export_import_modify( "cloud_storage_access_key: foobar\n", "cloud_storage_access_key: \"foobaz\"") self._wait_for_version_sync(version_b) self._check_value_everywhere("cloud_storage_access_key", "foobaz") @cluster(num_nodes=3) def test_rpk_status(self): """ This command is a thin wrapper over the status API that is covered more comprehensively in other tests: this case is just a superficial test that the command succeeds and returns info for each node. """ status_text = self.rpk.cluster_config_status() # Split into lines, skip first one (header) lines = status_text.strip().split("\n")[1:] # Example: # NODE CONFIG_VERSION NEEDS_RESTART INVALID UNKNOWN # 0 17 false [] [] assert len(lines) == len(self.redpanda.nodes) for i, l in enumerate(lines): m = re.match( r"^(\d+)\s+(\d+)\s+(true|false)\s+\[(.*)\]\s+\[(.*)\]$", l) assert m is not None node_id, *_ = m.groups() node = self.redpanda.nodes[i] assert int(node_id) == self.redpanda.idx(node) @cluster(num_nodes=3) def test_secret_redaction(self): def search_log(pattern): for node in self.redpanda.nodes: for line in node.account.ssh_capture( f"grep \"{pattern}\" {self.redpanda.STDOUT_STDERR_CAPTURE} || true" ): # We got a match self.logger.debug( f"Found {pattern} on node {node.name}: {line}") return True # Fall through, no matches return False def set_and_search(key, value, expect_log): patch_result = self.admin.patch_cluster_config(upsert={key: value}) self._wait_for_version_sync(patch_result['config_version']) # Check value was/was not printed to log while applying assert search_log(value) is expect_log # Check we do/don't print on next startup self.redpanda.restart_nodes(self.redpanda.nodes) assert search_log(value) is expect_log secret_key = "cloud_storage_secret_key" secret_value = "ThePandaFliesTonight" set_and_search(secret_key, secret_value, False) # To avoid false negatives in the test of a secret, go through the same procedure # but on a non-secret property, thereby validating that our log scanning procedure # would have detected the secret if it had been printed unsecret_key = "cloud_storage_api_endpoint" unsecret_value = "http://nowhere" set_and_search(unsecret_key, unsecret_value, True) @cluster(num_nodes=3) def test_incremental_alter_configs(self): """ Central config can also be accessed via Kafka API -- exercise that using `kcl`. :param incremental: whether to use incremental kafka config API or legacy config API. """ kcl = KCL(self.redpanda) # Redpanda only support incremental config changes: the legacy # AlterConfig API is a bad user experience incremental = True # Set a property by its redpanda name out = kcl.alter_broker_config( {"log_message_timestamp_type": "CreateTime"}, incremental) # kcl does not set an error exist status when config set fails, so must # read its output text to validate that calls are successful assert 'OK' in out out = kcl.alter_broker_config( {"log_message_timestamp_type": "LogAppendTime"}, incremental) assert 'OK' in out if incremental: kcl.delete_broker_config(["log_message_timestamp_type"], incremental) assert 'OK' in out # Set a property by its Kafka-interop names and values kafka_props = { "log.message.timestamp.type": ["CreateTime", "LogAppendTime"], "log.cleanup.policy": ["compact", "delete"], "log.compression.type": ["gzip", "snappy", "lz4", "zstd"], } for property, value_list in kafka_props.items(): for value in value_list: out = kcl.alter_broker_config({property: value}, incremental) assert 'OK' in out # Set a nonexistent property out = kcl.alter_broker_config({"does_not_exist": "avalue"}, incremental) assert 'INVALID_CONFIG' in out # Set a malformed property out = kcl.alter_broker_config( {"log_message_timestamp_type": "BadValue"}, incremental) assert 'INVALID_CONFIG' in out # Set a property on a named broker: should fail because this # interface is only for cluster-wide properties out = kcl.alter_broker_config( {"log_message_timestamp_type": "CreateTime"}, incremental, broker="1") assert 'INVALID_CONFIG' in out assert "Setting broker properties on named brokers is unsupported" in out @cluster(num_nodes=3) def test_alter_configs(self): """ We only support incremental config changes. Check that AlterConfigs requests are correctly handled with an 'unsupported' response. """ kcl = KCL(self.redpanda) out = kcl.alter_broker_config( {"log_message_timestamp_type": "CreateTime"}, incremental=False) self.logger.info("AlterConfigs output: {out}") assert 'INVALID_CONFIG' in out assert "changing broker properties isn't supported via this API" in out
class ClusterConfigTest(RedpandaTest): def __init__(self, *args, **kwargs): rp_conf = BOOTSTRAP_CONFIG.copy() # Force verbose logging for the secret redaction test kwargs['log_level'] = 'trace' super(ClusterConfigTest, self).__init__(*args, extra_rp_conf=rp_conf, **kwargs) self.admin = Admin(self.redpanda) self.rpk = RpkTool(self.redpanda) def setUp(self): super().setUp() # wait for the two config versions: # 1. The initial bootstrap where we "import" any cluster properties # that were in bootstrap.yaml # 2. The metrics reporter's first tick, where it initializes # the cluster_id property. self._wait_for_version_sync(2) @cluster(num_nodes=3) @parametrize(legacy=False) @parametrize(legacy=True) def test_get_config(self, legacy): """ Verify that the config GET endpoint serves valid json with some options in it. :param legacy: whether to use the legacy /config endpoint """ admin = Admin(self.redpanda) if legacy: config = admin._request("GET", "config").json() else: config = admin.get_cluster_config() # Pick an arbitrary config property to verify that the result # contained some properties assert 'enable_transactions' in config node_config = admin.get_node_config() # Some arbitrary property to check syntax of result assert 'kafka_api' in node_config @cluster(num_nodes=1) def test_get_config_nodefaults(self): admin = Admin(self.redpanda) initial_short_config = admin.get_cluster_config(include_defaults=False) long_config = admin.get_cluster_config(include_defaults=True) assert len(long_config) > len(initial_short_config) assert 'kafka_qdc_enable' not in initial_short_config # After setting something to non-default is should appear patch_result = self.admin.patch_cluster_config( upsert={'kafka_qdc_enable': True}) self._wait_for_version_sync(patch_result['config_version']) short_config = admin.get_cluster_config(include_defaults=False) assert 'kafka_qdc_enable' in short_config assert len(short_config) == len(initial_short_config) + 1 # After resetting to default it should disappear patch_result = self.admin.patch_cluster_config( remove=['kafka_qdc_enable']) self._wait_for_version_sync(patch_result['config_version']) short_config = admin.get_cluster_config(include_defaults=False) assert 'kafka_qdc_enable' not in short_config assert len(short_config) == len(initial_short_config) @cluster(num_nodes=3) def test_bootstrap(self): """ Verify that config settings present in redpanda.cfg are imported on first startup. :return: """ admin = Admin(self.redpanda) config = admin.get_cluster_config() for k, v in BOOTSTRAP_CONFIG.items(): assert config[k] == v set_again = {'enable_idempotence': False} assert BOOTSTRAP_CONFIG['enable_idempotence'] != set_again[ 'enable_idempotence'] self.redpanda.set_extra_rp_conf(set_again) self.redpanda.write_bootstrap_cluster_config() self.redpanda.restart_nodes(self.redpanda.nodes, set_again) # Our attempt to set the value differently in the config file after first startup # should have failed: the original config value should still be set. config = admin.get_cluster_config() for k, v in BOOTSTRAP_CONFIG.items(): assert config[k] == v def _wait_for_version_sync(self, version): wait_until( lambda: set([ n['config_version'] for n in self.admin.get_cluster_config_status() ]) == {version}, timeout_sec=10, backoff_sec=0.5, err_msg=f"Config status versions did not converge on {version}") def _check_restart_clears(self): """ After changing a setting with needs_restart=true, check that nodes clear the flag after being restarted. """ status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is True first_node = self.redpanda.nodes[0] other_nodes = self.redpanda.nodes[1:] self.redpanda.restart_nodes(first_node) wait_until(lambda: self.admin.get_cluster_config_status()[0]['restart'] == False, timeout_sec=10, backoff_sec=0.5, err_msg=f"Restart flag did not clear after restart") self.redpanda.restart_nodes(other_nodes) wait_until(lambda: set( [n['restart'] for n in self.admin.get_cluster_config_status()]) == {False}, timeout_sec=10, backoff_sec=0.5, err_msg=f"Not all nodes cleared restart flag") @cluster(num_nodes=3) def test_restart(self): """ Verify that a setting requiring restart is indicated as such in status, and that status is cleared after we restart the node. """ # An arbitrary restart-requiring setting with a non-default value new_setting = ('kafka_qdc_idle_depth', 77) patch_result = self.admin.patch_cluster_config( upsert=dict([new_setting])) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ new_setting[0]] == new_setting[1] # Update of cluster status is not synchronous self._check_restart_clears() # Test that a reset to default triggers the restart flag the same way as # an upsert does patch_result = self.admin.patch_cluster_config(remove=[new_setting[0]]) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ new_setting[0]] != new_setting[1] self._check_restart_clears() @cluster(num_nodes=3) def test_multistring_restart(self): """ Reproduce an issue where the key we edit is saved correctly, but other cached keys are getting extra-quoted. """ # Initially set both values together patch_result = self.admin.patch_cluster_config( upsert={ "cloud_storage_access_key": "user", "cloud_storage_secret_key": "pass" }) self._wait_for_version_sync(patch_result['config_version']) self._check_value_everywhere("cloud_storage_access_key", "user") self._check_value_everywhere("cloud_storage_secret_key", "pass") # Check initially set values survive a restart self.redpanda.restart_nodes(self.redpanda.nodes) self._check_value_everywhere("cloud_storage_access_key", "user") self._check_value_everywhere("cloud_storage_secret_key", "pass") # Set just one of the values patch_result = self.admin.patch_cluster_config( upsert={"cloud_storage_access_key": "user2"}) self._wait_for_version_sync(patch_result['config_version']) self._check_value_everywhere("cloud_storage_access_key", "user2") self._check_value_everywhere("cloud_storage_secret_key", "pass") # Check that the recently set value persists, AND the originally # set value of another property is not corrupted. self.redpanda.restart_nodes(self.redpanda.nodes) self._check_value_everywhere("cloud_storage_access_key", "user2") self._check_value_everywhere("cloud_storage_secret_key", "pass") def _check_value_everywhere(self, key, expect_value): for node in self.redpanda.nodes: actual_value = self.admin.get_cluster_config(node)[key] if actual_value != expect_value: self.logger.error( f"Wrong value on node {node.account.hostname}: {key}={actual_value} (!={expect_value})" ) assert self.admin.get_cluster_config(node)[key] == expect_value def _check_propagated_and_persistent(self, key, expect_value): """ Verify that a configuration value has successfully propagated to all nodes, and that it persists after a restart. """ self._check_value_everywhere(key, expect_value) self.redpanda.restart_nodes(self.redpanda.nodes) self._check_value_everywhere(key, expect_value) @cluster(num_nodes=3) def test_simple_live_change(self): # An arbitrary non-restart-requiring setting norestart_new_setting = ('log_message_timestamp_type', "LogAppendTime") assert self.admin.get_cluster_config()[ norestart_new_setting[0]] == "CreateTime" # Initially default patch_result = self.admin.patch_cluster_config( upsert=dict([norestart_new_setting])) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ norestart_new_setting[0]] == norestart_new_setting[1] # Status should not indicate restart needed status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False # Setting should be propagated and survive a restart self._check_propagated_and_persistent(norestart_new_setting[0], norestart_new_setting[1]) @cluster(num_nodes=3) @parametrize(key='log_message_timestamp_type', value="rhubarb") @parametrize(key='log_message_timestamp_type', value="31415") @parametrize(key='log_message_timestamp_type', value="false") @parametrize(key='kafka_qdc_enable', value="rhubarb") @parametrize(key='kafka_qdc_enable', value="31415") @parametrize(key='metadata_dissemination_retries', value="rhubarb") @parametrize(key='metadata_dissemination_retries', value="false") @parametrize(key='it_does_not_exist', value="123") def test_invalid_settings(self, key, value): """ Test that without force=true, attempts to set invalid property values are rejected with a 400 status. """ try: patch_result = self.admin.patch_cluster_config(upsert={key: value}) except requests.exceptions.HTTPError as e: if e.response.status_code != 400: raise errors = e.response.json() assert set(errors.keys()) == {key} else: raise RuntimeError( f"Expected 400 but got {patch_result} for {key}={value})") @cluster(num_nodes=1) def test_dry_run(self): """ Verify that when the dry_run flag is used, validation is done but changes are not made. """ # An invalid PUT try: self.admin.patch_cluster_config( upsert={"log_message_timestamp_type": "rhubarb"}, dry_run=True) except requests.exceptions.HTTPError as e: if e.response.status_code != 400: raise assert set( e.response.json().keys()) == {"log_message_timestamp_type"} else: raise RuntimeError(f"Expected 400 but got success") # A valid PUT self.admin.patch_cluster_config( upsert={"log_message_timestamp_type": "LogAppendTime"}, dry_run=True) # Check the value didn't get set (i.e. remains default) self._check_value_everywhere("log_message_timestamp_type", "CreateTime") @cluster(num_nodes=3) def test_invalid_settings_forced(self): """ Test that if a value makes it past the frontend API validation, it is caught at the point of apply on each node, and fed back in the config_status. """ default_value = "CreateTime" invalid_setting = ('log_message_timestamp_type', "rhubarb") assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value patch_result = self.admin.patch_cluster_config(upsert=dict( [invalid_setting]), force=True) new_version = patch_result['config_version'] self._wait_for_version_sync(new_version) assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value # Status should not indicate restart needed status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [invalid_setting[0]] # List of invalid properties in node status should not clear on restart. self.redpanda.restart_nodes(self.redpanda.nodes) # We have to sleep here because in the success case there is no status update # being sent: it's a no-op after node startup when they realize their config # status is the same as the one already reported. time.sleep(10) status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [invalid_setting[0]] # Reset the properties, check that it disappears from the list of invalid settings patch_result = self.admin.patch_cluster_config( remove=[invalid_setting[0]], force=True) self._wait_for_version_sync(patch_result['config_version']) assert self.admin.get_cluster_config()[ invalid_setting[0]] == default_value status = self.admin.get_cluster_config_status() for n in status: assert n['restart'] is False assert n['invalid'] == [] # TODO as well as specific invalid examples, do a pass across the whole # schema to check that pass @cluster(num_nodes=3) def test_bad_requests(self): """ Verify that syntactically malformed configuration requests result in proper 400 responses (rather than 500s or crashes) """ for content_type, body in [ ('text/html', ""), # Wrong type, empty ('text/html', "garbage"), # Wrong type, nonempty ('application/json', ""), # Empty ('application/json', "garbage"), # Not JSON ('application/json', "{\"a\": 123}"), # Wrong top level attributes ('application/json', "{\"upsert\": []}"), # Wrong type of 'upsert' ]: try: self.logger.info(f"Checking {content_type}, {body}") self.admin._request("PUT", "cluster_config", node=self.redpanda.nodes[0], headers={'content-type': content_type}, data=body) except requests.exceptions.HTTPError as e: assert e.response.status_code == 400 else: # Should not succeed! assert False @cluster(num_nodes=3) def test_valid_settings(self): """ Bulk exercise of all config settings & the schema endpoint: - for all properties in the schema, set them with a valid non-default value - check the new values are reflected in config GET - restart all nodes (prompt a reload from cache file) - check the new values are reflected in config GET This is not just checking the central config infrastructure: it's also validating that all the property types are outputting the same format as their input (e.g. they have proper rjson_serialize implementations) """ schema_properties = self.admin.get_cluster_config_schema( )['properties'] updates = {} properties_require_restart = False # Don't change these settings, they prevent the test from subsequently # using the cluster exclude_settings = {'enable_sasl'} # Don't enable coproc: it generates log errors if its companion service isn't running exclude_settings.add('enable_coproc') initial_config = self.admin.get_cluster_config() for name, p in schema_properties.items(): if name in exclude_settings: continue properties_require_restart |= p['needs_restart'] initial_value = initial_config[name] if 'example' in p: valid_value = p['example'] if p['type'] == "array": valid_value = yaml.load(valid_value) elif p['type'] == 'integer': if initial_value: valid_value = initial_value * 2 else: valid_value = 100 elif p['type'] == 'number': if initial_value: valid_value = float(initial_value * 2) else: valid_value = 1000.0 elif p['type'] == 'string': if name.endswith("_url"): valid_value = "http://example.com" else: valid_value = "rhubarb" elif p['type'] == 'boolean': valid_value = not initial_config[name] elif p['type'] == "array" and p['items']['type'] == 'string': valid_value = ["custard", "cream"] else: raise NotImplementedError(p['type']) if name == 'enable_coproc': # Don't try enabling coproc, it has external dependencies continue if name == 'admin_api_require_auth': # Don't lock ourselves out of the admin API! continue if name == 'cloud_storage_enabled': # Enabling cloud storage requires setting other properties too continue updates[name] = valid_value patch_result = self.admin.patch_cluster_config(upsert=updates, remove=[]) self._wait_for_version_sync(patch_result['config_version']) def check_status(expect_restart): # Use one node's status, they should be symmetric status = self.admin.get_cluster_config_status()[0] self.logger.info(f"Status: {json.dumps(status, indent=2)}") assert status['invalid'] == [] assert status['restart'] is expect_restart def check_values(): read_back = self.admin.get_cluster_config() mismatch = [] for k, expect in updates.items(): # String-ized comparison, because the example values are strings, # whereas by the time we read them back they're properly typed. actual = read_back.get(k, None) if isinstance(actual, bool): # Lowercase because yaml and python capitalize bools differently. actual = str(actual).lower() else: actual = str(actual) if actual != str(expect): self.logger.error( f"Config set failed ({k}) {actual}!={expect}") mismatch.append((k, actual, expect)) assert len(mismatch) == 0 check_status(properties_require_restart) check_values() self.redpanda.restart_nodes(self.redpanda.nodes) # We have to sleep here because in the success case there is no status update # being sent: it's a no-op after node startup when they realize their config # status is the same as the one already reported. time.sleep(10) # Check after restart that confuration persisted and status shows valid check_status(False) check_values() def _export(self, all): with tempfile.NamedTemporaryFile('r') as file: self.rpk.cluster_config_export(file.name, all) return file.read() def _import(self, text, all, allow_noop=False): with tempfile.NamedTemporaryFile('w') as file: file.write(text) file.flush() import_stdout = self.rpk.cluster_config_import(file.name, all) last_line = import_stdout.strip().split("\n")[-1] m = re.match(r"^.+New configuration version is (\d+).*$", last_line) self.logger.debug(f"_import status: {last_line}") if m is None and allow_noop: return None assert m is not None, f"Config version not found: {last_line}" version = int(m.group(1)) return version def _export_import_modify_one(self, before: str, after: str, all=False): return self._export_import_modify([(before, after)], all) def _export_import_modify(self, changes: list[tuple[str, str]], all=False): text = self._export(all) # Validate that RPK gives us valid yaml _ = yaml.full_load(text) self.logger.debug(f"Exported config before modification: {text}") for before, after in changes: self.logger.debug(f"Replacing \"{before}\" with \"{after}\"") # Intentionally not passing this through a YAML deserialize/serialize # step during edit, to more realistically emulate someone hand editing text = text.replace(before, after) self.logger.debug(f"Exported config after modification: {text}") # Edit a setting, import the resulting document version = self._import(text, all) return version, text @cluster(num_nodes=3) def test_rpk_export_import(self): """ Test `rpk cluster config [export|import]` and implicitly also `edit` (which is just an export/import cycle with a text editor run in the middle) """ # An arbitrary tunable for checking --all tunable_property = 'kafka_qdc_depth_alpha' # RPK should give us a valid yaml document version_a, text = self._export_import_modify_one( "kafka_qdc_enable: false", "kafka_qdc_enable: true") assert version_a is not None self._wait_for_version_sync(version_a) # Default should not have included tunables assert tunable_property not in text # The setting we edited should be updated self._check_value_everywhere("kafka_qdc_enable", True) # Clear a setting, it should revert to its default version_b, text = self._export_import_modify_one( "kafka_qdc_enable: true", "") assert version_b is not None assert version_b > version_a self._wait_for_version_sync(version_b) self._check_value_everywhere("kafka_qdc_enable", False) # Check that an --all export includes tunables text_all = self._export(all=True) assert tunable_property in text_all # Check that editing a tunable with --all works version_c, text = self._export_import_modify_one( "kafka_qdc_depth_alpha: 0.8", "kafka_qdc_depth_alpha: 1.5", all=True) assert version_c is not None assert version_c > version_b self._wait_for_version_sync(version_c) self._check_value_everywhere("kafka_qdc_depth_alpha", 1.5) # Check that clearing a tunable with --all works version_d, text = self._export_import_modify_one( "kafka_qdc_depth_alpha: 1.5", "", all=True) assert version_d is not None assert version_d > version_c self._wait_for_version_sync(version_d) self._check_value_everywhere("kafka_qdc_depth_alpha", 0.8) # Check that an import/export with no edits does nothing. text = self._export(all=True) noop_version = self._import(text, allow_noop=True, all=True) assert noop_version is None @cluster(num_nodes=3) def test_rpk_import_validation(self): """ Verify that RPK handles 400 responses on import nicely """ # RPK should return an error with explanatory text try: _, out = self._export_import_modify( [("kafka_qdc_enable: false", "kafka_qdc_enable: rhubarb"), ("topic_fds_per_partition: 10", "topic_fds_per_partition: 9999"), ("default_num_windows: 10", "default_num_windows: 32768")], all=True) except RpkException as e: assert 'kafka_qdc_enable: expected type boolean' in e.stderr assert 'topic_fds_per_partition: too large' in e.stderr assert 'default_num_windows: out of range' in e.stderr else: raise RuntimeError( f"RPK command should have failed, but ran with output: {out}") @cluster(num_nodes=3) def test_rpk_edit_string(self): """ Test import/export of string fields, make sure they don't end up with extraneous quotes """ version_a, _ = self._export_import_modify_one( "cloud_storage_access_key:\n", "cloud_storage_access_key: foobar\n") self._wait_for_version_sync(version_a) self._check_value_everywhere("cloud_storage_access_key", "foobar") version_b, _ = self._export_import_modify_one( "cloud_storage_access_key: foobar\n", "cloud_storage_access_key: \"foobaz\"") self._wait_for_version_sync(version_b) self._check_value_everywhere("cloud_storage_access_key", "foobaz") @cluster(num_nodes=3) def test_rpk_status(self): """ This command is a thin wrapper over the status API that is covered more comprehensively in other tests: this case is just a superficial test that the command succeeds and returns info for each node. """ status_text = self.rpk.cluster_config_status() # Split into lines, skip first one (header) lines = status_text.strip().split("\n")[1:] # Example: # NODE CONFIG_VERSION NEEDS_RESTART INVALID UNKNOWN # 0 17 false [] [] assert len(lines) == len(self.redpanda.nodes) for i, l in enumerate(lines): m = re.match( r"^(\d+)\s+(\d+)\s+(true|false)\s+\[(.*)\]\s+\[(.*)\]$", l) assert m is not None node_id, *_ = m.groups() node = self.redpanda.nodes[i] assert int(node_id) == self.redpanda.idx(node) @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST) def test_rpk_force_reset(self): """ Verify that RPK's `reset` command for disaster recovery works as expected: redpanda should start up and behave as if the property is its default value. """ # Set some non-default config value pr = self.admin.patch_cluster_config(upsert={ 'kafka_qdc_enable': True, 'append_chunk_size': 65536 }) self._wait_for_version_sync(pr['config_version']) self._check_value_everywhere("kafka_qdc_enable", True) # Reset the property on all nodes for node in self.redpanda.nodes: rpk_remote = RpkRemoteTool(self.redpanda, node) self.redpanda.stop_node(node) rpk_remote.cluster_config_force_reset("kafka_qdc_enable") self.redpanda.start_node(node) # Check that the reset property has reverted to its default self._check_value_everywhere("kafka_qdc_enable", False) # Check that the bystander config property was not reset self._check_value_everywhere("append_chunk_size", 65536) @cluster(num_nodes=1, log_allow_list=RESTART_LOG_ALLOW_LIST) def test_rpk_lint(self): """ Verify that if a redpanda config contains a cluster config property, then running `lint` cleans out that value, as it is no longer used. """ node = self.redpanda.nodes[0] # Put an old-style property in the config self.logger.info("Restarting with legacy property") self.redpanda.restart_nodes([node], override_cfg_params={ "kafka_qdc_enable": True, }) old_conf = node.account.ssh_output( "cat /etc/redpanda/redpanda.yaml").decode('utf-8') assert 'kafka_qdc_enable' in old_conf # Run lint self.logger.info("Linting config") rpk_remote = RpkRemoteTool(self.redpanda, node) rpk_remote.cluster_config_lint() # Check that the old style config property was removed new_conf = node.account.ssh_output( "cat /etc/redpanda/redpanda.yaml").decode('utf-8') assert 'kafka_qdc_enable' not in new_conf # Check that the linted config file is not corrupt (redpanda loads with it) self.logger.info("Restarting with linted config") self.redpanda.stop_node(node) self.redpanda.start_node(node, write_config=False) @cluster(num_nodes=1) def test_rpk_get_set(self): """ Test RPK's getter+setter helpers """ Example = namedtuple('Example', ['key', 'strval', 'yamlval']) valid_examples = [ Example("kafka_qdc_enable", "true", True), Example("append_chunk_size", "32768", 32768), Example("superusers", "['bob','alice']", ["bob", "alice"]) ] def yamlize(input): """Create a YAML representation that matches what yaml-cpp produces: PyYAML includes trailing ellipsis lines that must be removed.""" return "\n".join([ i for i in yaml.dump(e.yamlval).split("\n") if i.strip() != "..." ]).strip() # Check that valid changes are accepted, and the change is reflected # in the underlying API-visible configuration for e in valid_examples: self.logger.info(f"Checking {e.key}={e.strval} ({e.yamlval})") self.rpk.cluster_config_set(e.key, e.strval) # CLI readback should give same as we set cli_readback = self.rpk.cluster_config_get(e.key) expect_cli_readback = yamlize(e.yamlval) self.logger.info( f"CLI readback '{cli_readback}' expect '{expect_cli_readback}'" ) assert cli_readback == expect_cli_readback # API readback should give properly structured+typed value api_readback = self.admin.get_cluster_config()[e.key] self.logger.info(f"API readback for {e.key} '{api_readback}'") assert api_readback == e.yamlval # Check that the `set` command hits proper validation paths invalid_examples = [ ("kafka_qdc_enable", "rhubarb"), ("append_chunk_size", "-123"), ("superusers", "43"), ] for key, strval in invalid_examples: try: self.rpk.cluster_config_set(key, strval) except RpkException as e: pass else: self.logger.error( f"Config setting {key}={strval} should have been rejected") assert False # Check that resetting properties to their default via `set` works default_examples = [ ("kafka_qdc_enable", False), ("append_chunk_size", 16384), ("superusers", []), ] for key, expect_default in default_examples: self.rpk.cluster_config_set(key, "") api_readback = self.admin.get_cluster_config()[key] self.logger.info( f"API readback for {key} '{api_readback}' (expect {expect_default})" ) assert api_readback == expect_default @cluster(num_nodes=3) def test_secret_redaction(self): def set_and_search(key, value, expect_log): patch_result = self.admin.patch_cluster_config(upsert={key: value}) self._wait_for_version_sync(patch_result['config_version']) # Check value was/was not printed to log while applying assert search_log(self.redpanda, value) is expect_log # Check we do/don't print on next startup self.redpanda.restart_nodes(self.redpanda.nodes) assert search_log(self.redpanda, value) is expect_log secret_key = "cloud_storage_secret_key" secret_value = "ThePandaFliesTonight" set_and_search(secret_key, secret_value, False) # To avoid false negatives in the test of a secret, go through the same procedure # but on a non-secret property, thereby validating that our log scanning procedure # would have detected the secret if it had been printed unsecret_key = "cloud_storage_api_endpoint" unsecret_value = "http://nowhere" set_and_search(unsecret_key, unsecret_value, True) @cluster(num_nodes=3) def test_incremental_alter_configs(self): """ Central config can also be accessed via Kafka API -- exercise that using `kcl`. :param incremental: whether to use incremental kafka config API or legacy config API. """ # Redpanda only support incremental config changes: the legacy # AlterConfig API is a bad user experience incremental = True # Set a property by its redpanda name out = self.client().alter_broker_config( {"log_message_timestamp_type": "CreateTime"}, incremental) # kcl does not set an error exist status when config set fails, so must # read its output text to validate that calls are successful assert 'OK' in out out = self.client().alter_broker_config( {"log_message_timestamp_type": "LogAppendTime"}, incremental) assert 'OK' in out if incremental: self.client().delete_broker_config(["log_message_timestamp_type"], incremental) assert 'OK' in out # Set a property by its Kafka-interop names and values kafka_props = { "log.message.timestamp.type": ["CreateTime", "LogAppendTime"], "log.cleanup.policy": ["compact", "delete"], "log.compression.type": ["gzip", "snappy", "lz4", "zstd"], } for property, value_list in kafka_props.items(): for value in value_list: out = self.client().alter_broker_config({property: value}, incremental) assert 'OK' in out # Set a nonexistent property out = self.client().alter_broker_config({"does_not_exist": "avalue"}, incremental) assert 'INVALID_CONFIG' in out # Set a malformed property out = self.client().alter_broker_config( {"log_message_timestamp_type": "BadValue"}, incremental) assert 'INVALID_CONFIG' in out # Set a property on a named broker: should fail because this # interface is only for cluster-wide properties out = self.client().alter_broker_config( {"log_message_timestamp_type": "CreateTime"}, incremental, broker=1) assert 'INVALID_CONFIG' in out assert "Setting broker properties on named brokers is unsupported" in out @cluster(num_nodes=3) def test_alter_configs(self): """ We only support incremental config changes. Check that AlterConfigs requests are correctly handled with an 'unsupported' response. """ out = self.client().alter_broker_config( {"log_message_timestamp_type": "CreateTime"}, incremental=False) self.logger.info("AlterConfigs output: {out}") assert 'INVALID_CONFIG' in out assert "changing broker properties isn't supported via this API" in out @cluster(num_nodes=3) def test_cloud_validation(self): """ Cloud storage configuration has special multi-property rules, check they are enforced. """ # It is invalid to enable cloud storage without its accompanying properties invalid_update = {'cloud_storage_enabled': True} with expect_http_error(400): self.admin.patch_cluster_config(upsert=invalid_update, remove=[]) # It is valid to enable cloud storage along with its accompanying properties valid_update = { 'cloud_storage_enabled': True, 'cloud_storage_secret_key': 'open', 'cloud_storage_access_key': 'sesame', 'cloud_storage_region': 'us-east-1', 'cloud_storage_bucket': 'dearliza' } patch_result = self.admin.patch_cluster_config(upsert=valid_update, remove=[]) self._wait_for_version_sync(patch_result['config_version']) # Check we really set it properly, and Redpanda can restart without # hitting a validation issue on startup (this is what would happen # if the API validation wasn't working properly) self.redpanda.restart_nodes(self.redpanda.nodes) # It is invalid to clear any required cloud storage properties while # cloud storage is enabled forbidden_to_clear = [ 'cloud_storage_secret_key', 'cloud_storage_access_key', 'cloud_storage_region', 'cloud_storage_bucket' ] for key in forbidden_to_clear: with expect_http_error(400): self.admin.patch_cluster_config(upsert={}, remove=[key]) # Switching off cloud storage is always valid, we can leave the other # properties set patch_result = self.admin.patch_cluster_config( upsert={'cloud_storage_enabled': False}, remove=[]) self._wait_for_version_sync(patch_result['config_version']) # Clearing related properties is valid now that cloud storage is # disabled for key in forbidden_to_clear: self.admin.patch_cluster_config(upsert={}, remove=[key]) @cluster(num_nodes=3) def test_cluster_id(self): """ That the cluster_id exposed in Kafka metadata is automatically populated with a uuid, that it starts with redpanda. and that it can be overridden by setting the property to something else. """ rpk = RpkTool(self.redpanda) # An example, we will compare lengths with this uuid_example = "redpanda.87e8c0c3-7c2a-4f7b-987f-11fc1d2443a4" def has_uuid_cluster_id(): cluster_id = rpk.cluster_metadata_id() self.logger.info(f"cluster_id={cluster_id}") return cluster_id is not None and len(cluster_id) == len( uuid_example) # This is a wait_until because the initialization of cluster_id # is async and can happen after the cluster starts answering Kafka requests. wait_until(has_uuid_cluster_id, timeout_sec=20, backoff_sec=1) # Verify that the cluster_id does not change on a restart initial_cluster_id = rpk.cluster_metadata_id() self.redpanda.restart_nodes(self.redpanda.nodes) assert rpk.cluster_metadata_id() == initial_cluster_id # Verify that a manually set cluster_id is respected manual_id = "rhubarb" self.redpanda.set_cluster_config(values={"cluster_id": manual_id}, expect_restart=False) assert rpk.cluster_metadata_id() == f"redpanda.{manual_id}"