def set_scylla_sysctl_value(db_cluster: ScyllaPodCluster, sysctl_name: str, sysctl_value: str) -> None:
    sysctls = db_cluster.get_scylla_cluster_plain_value('/spec/sysctls')
    sysctl_to_set = f"{sysctl_name}={sysctl_value}"
    for i, _ in enumerate(sysctls):
        if sysctls[i].startswith(f"{sysctl_name}="):
            sysctls[i] = sysctl_to_set
            break
    else:
        sysctls.append(sysctl_to_set)
    db_cluster.replace_scylla_cluster_value("/spec/sysctls", sysctls)
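
# NOTE (illustrative): the helper above goes through the cluster object's patching API.
# The sketch below shows roughly what such a sysctl update corresponds to at the Kubernetes level,
# assuming the ScyllaCluster CR is patched with a JSON patch via kubectl. The namespace, CR name
# and the use of kubectl here are assumptions for illustration only, not the actual implementation
# behind 'replace_scylla_cluster_value'.
def _example_patch_sysctls_with_kubectl(namespace: str, cluster_name: str, sysctls: list) -> None:
    import json
    import subprocess

    # Replace the whole /spec/sysctls list on the ScyllaCluster custom resource
    patch = [{"op": "replace", "path": "/spec/sysctls", "value": sysctls}]
    subprocess.run(
        ["kubectl", "-n", namespace, "patch", "scyllacluster", cluster_name,
         "--type=json", "-p", json.dumps(patch)],
        check=True,
    )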
def _bring_cluster_back_to_original_state(db_cluster: ScyllaPodCluster,
                                           config_map: dict,
                                           original_scylla_cluster_spec: dict):
    restart = False
    try:
        # Restore the cluster spec. There is one problem though:
        # scylla-operator does not support rack removal, so if we see an extra rack we need to remove
        # its members but keep the rack itself in the cluster spec.
        # The other alternative is to redeploy the cluster, which is expensive, so we do that only if needed.
        original_rack_specs = original_scylla_cluster_spec.get('datacenter', {}).get('racks', [])
        current_cluster_spec = db_cluster.get_scylla_cluster_plain_value('/spec')
        current_rack_specs = current_cluster_spec.get('datacenter', {}).get('racks', [])
        if len(original_rack_specs) < len(current_rack_specs):
            # Add the extra racks, scaled down to 0 members, to the original cluster specification.
            # At this point original_rack_specs describes the cluster spec we want to end up with.
            new_racks = current_rack_specs[len(original_rack_specs):]
            for rack in new_racks:
                rack['members'] = 0
            original_rack_specs.extend(new_racks)

        # NOTE: always ignore the 'forceRedeploymentReason' field to avoid redundant restarts
        original_scylla_cluster_spec.pop("forceRedeploymentReason", None)
        current_cluster_spec.pop("forceRedeploymentReason", None)

        if original_scylla_cluster_spec != current_cluster_spec:
            # If the cluster spec we currently have is not equal to what we want, replace it and
            # remember to restart the cluster afterwards
            db_cluster.replace_scylla_cluster_value('/spec', original_scylla_cluster_spec)
            restart = True

        # Restore the scylla-config config-map
        with db_cluster.scylla_config_map as recover_config_map:
            if recover_config_map != config_map:
                # If the config map changed, Scylla gets restarted anyway,
                # so we don't have to restart it explicitly
                restart = False
                recover_config_map.clear()
                recover_config_map.update(config_map)

        if restart:
            db_cluster.restart_scylla()
    except Exception as exc:  # pylint: disable=broad-except
        tester.healthy_flag = False
        pytest.fail("Failed to bring cluster back to its original state due to:\n" +
                    "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)))
def get_scylla_sysctl_value(db_cluster: ScyllaPodCluster, sysctl_name: str) -> int:
    sysctls = db_cluster.get_scylla_cluster_plain_value('/spec/sysctls')
    for sysctl in sysctls:
        if sysctl.startswith(f"{sysctl_name}="):
            return int(sysctl.split("=")[-1])
    raise ValueError(f"Cannot find '{sysctl_name}' sysctl")
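
# NOTE (illustrative): a minimal usage sketch of the two sysctl helpers above, assuming a
# 'db_cluster' object of type ScyllaPodCluster is available; the sysctl name and the increment
# are arbitrary example values.
def _example_bump_aio_max_nr(db_cluster: ScyllaPodCluster) -> int:
    current = get_scylla_sysctl_value(db_cluster, "fs.aio-max-nr")
    # set_scylla_sysctl_value() updates the existing "fs.aio-max-nr=<n>" entry in /spec/sysctls,
    # or appends a new entry if one is not present yet
    set_scylla_sysctl_value(db_cluster, "fs.aio-max-nr", str(current + 1))
    return get_scylla_sysctl_value(db_cluster, "fs.aio-max-nr")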
def test_ha_update_spec_while_rollout_restart(db_cluster: ScyllaPodCluster):
    """
    Cover the issue https://github.com/scylladb/scylla-operator/issues/410
    Validate that cluster resources can be updated while the scylla-operator is rolling out.
    - update the cluster specification a few times
    - start a rollout restart in parallel with the updates
    - validate that the cluster specification has been updated
    """
    terminate_change_spec_thread = threading.Event()
    value = 1048576
    crd_update_errors = []

    def change_cluster_spec():
        nonlocal value
        nonlocal crd_update_errors
        while not terminate_change_spec_thread.wait(0.1):
            try:
                db_cluster.replace_scylla_cluster_value('/spec/sysctls', [f"fs.aio-max-nr={value + 1}"])
                # Increase the value only after the sysctls spec value has been updated, to avoid the
                # situation where the counter was increased but the sysctls spec value was not updated
                value += 1
            except Exception as error:  # pylint: disable=broad-except
                log.debug("Change /spec/sysctls value to %d failed. Error: %s", value, str(error))
                crd_update_errors.append(str(error))

    change_cluster_spec_thread = threading.Thread(target=change_cluster_spec, daemon=True)

    log.info("Start update cluster specification")
    change_cluster_spec_thread.start()

    log.info("Start rollout restart")
    scylla_operator_rollout_restart(db_cluster)
    operator_rollout_errors = wait_for_scylla_operator_rollout_complete(db_cluster)
    assert not operator_rollout_errors, \
        "Rollout restart failed. Reasons: {}".format('\n'.join(operator_rollout_errors))

    log.info("Stop update cluster specification")
    terminate_change_spec_thread.set()
    change_cluster_spec_thread.join()

    assert not crd_update_errors, \
        "Found following errors during rollout restart: {}".format("\n".join(crd_update_errors))

    sysctl_value = db_cluster.get_scylla_cluster_plain_value('/spec/sysctls')
    expected_sysctl_value = [f"fs.aio-max-nr={value}"]
    assert expected_sysctl_value == sysctl_value, \
        f"Cluster specification has not been updated. Expected {expected_sysctl_value}, actual {sysctl_value}"
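
# NOTE (illustrative): 'scylla_operator_rollout_restart' and 'wait_for_scylla_operator_rollout_complete'
# are existing helpers used by the test above; the sketch below only illustrates the underlying
# Kubernetes operations, assuming the operator runs as a Deployment named 'scylla-operator' in the
# 'scylla-operator' namespace (both names are assumptions, not taken from those helpers).
def _example_operator_rollout_restart(namespace: str = "scylla-operator",
                                      deployment: str = "scylla-operator") -> None:
    import subprocess

    # Trigger a rolling restart of the operator pods ...
    subprocess.run(["kubectl", "-n", namespace, "rollout", "restart", f"deployment/{deployment}"],
                   check=True)
    # ... and block until the new replicas are fully rolled out
    subprocess.run(["kubectl", "-n", namespace, "rollout", "status", f"deployment/{deployment}",
                    "--timeout=300s"],
                   check=True)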
def _bring_cluster_back_to_original_state(
        db_cluster: ScyllaPodCluster,
        config_map: dict,
        original_scylla_cluster_spec: dict
):
    restart = False
    try:
        # Restore the cluster spec. There is one problem though:
        # scylla-operator does not support rack removal, so if we see an extra rack we need to remove
        # its members but keep the rack itself in the cluster spec.
        # The other alternative is to redeploy the cluster, which is expensive, so we do that only if needed.
        original_rack_specs = original_scylla_cluster_spec.get('datacenter', {}).get('racks', [])
        current_cluster_spec = db_cluster.get_scylla_cluster_plain_value('/spec')
        current_rack_specs = current_cluster_spec.get('datacenter', {}).get('racks', [])
        if len(original_rack_specs) < len(current_rack_specs):
            # Add the extra racks, scaled down to 0 members, to the original cluster specification.
            # At this point original_rack_specs describes the cluster spec we want to end up with.
            new_racks = current_rack_specs[len(original_rack_specs):]
            for rack in new_racks:
                rack['members'] = 0
            original_rack_specs.extend(new_racks)

        # Restore the scylla-config config-map
        with db_cluster.scylla_config_map as recover_config_map:
            if recover_config_map != config_map:
                recover_config_map.clear()
                recover_config_map.update(config_map)
                restart = True

        # NOTE: always ignore the 'forceRedeploymentReason' field to avoid redundant restarts
        original_scylla_cluster_spec.pop("forceRedeploymentReason", None)
        current_cluster_spec.pop("forceRedeploymentReason", None)

        if original_scylla_cluster_spec != current_cluster_spec:
            # If the cluster spec we currently have is not equal to what we want, replace it.
            # That causes a rollout restart of the Scylla pods on the operator level.
            # WARNING: if the number of nodes differs, we will end up with incorrect data
            # in "db_cluster.nodes". For the moment, all changes to the node count must
            # go through the 'add_nodes' and 'decommission' methods only.
            db_cluster.replace_scylla_cluster_value('/spec', original_scylla_cluster_spec)
            db_cluster.wait_sts_rollout_restart(len(db_cluster.nodes))
            restart = False

        if restart:
            db_cluster.restart_scylla()
    except Exception as exc:  # pylint: disable=broad-except
        tester.healthy_flag = False
        pytest.fail("Failed to bring cluster back to its original state due to:\n" +
                    "".join(traceback.format_exception(type(exc), exc, exc.__traceback__)))
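
# NOTE (illustrative): a minimal sketch of how the restore helper above could be wired into a
# pytest fixture, assuming 'db_cluster' is provided by another fixture; the fixture name and the
# deep-copy snapshot approach are assumptions for illustration, not the test suite's actual fixture.
@pytest.fixture
def _example_restore_cluster_state(db_cluster: ScyllaPodCluster):
    import copy

    # Snapshot the state we want to return to before the test mutates anything
    original_spec = copy.deepcopy(db_cluster.get_scylla_cluster_plain_value('/spec'))
    with db_cluster.scylla_config_map as config_map:
        original_config_map = copy.deepcopy(config_map)

    yield

    # Roll the cluster spec and the scylla-config config-map back after the test
    _bring_cluster_back_to_original_state(db_cluster, original_config_map, original_spec)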