def test_query_preempted(self):
    priority_pod, priority_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)

    def run_queries_with_priority():
        sql = ""
        for i in range(50):
            sql += f"SET priority={i % 20};SELECT uniq(number) FROM numbers(20000000):"
        # queries are ':'-separated so that xargs -d ':' can fan them out across 20 parallel clients
        cmd = f"echo \\\"{sql} SELECT 1\\\" | xargs -i'{{}}' --no-run-if-empty -d ':' -P 20 clickhouse-client --time -m -n -q \\\"{{}}\\\""
        kubectl.launch(f"exec {priority_pod} -- bash -c \"{cmd}\"", timeout=120)
        clickhouse.query(
            chi["metadata"]["name"],
            "SELECT event_time, CurrentMetric_QueryPreempted FROM system.metric_log WHERE CurrentMetric_QueryPreempted > 0",
            host=priority_svc,
        )

    with Then("check ClickHouseQueryPreempted firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseQueryPreempted", "firing", True,
            labels={"hostname": priority_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=run_queries_with_priority)
        assert fired, error("can't get ClickHouseQueryPreempted alert in firing state")
    with Then("check ClickHouseQueryPreempted gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseQueryPreempted", "firing", False,
            labels={"hostname": priority_svc})
        assert resolved, error("can't check ClickHouseQueryPreempted alert is gone away")
def test_longest_running_query(self):
    long_running_pod, long_running_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    # query must outlive the 600s alert trigger plus 2*30s (double the prometheus scrape interval)
    clickhouse.query(
        chi["metadata"]["name"],
        "SELECT now(),sleepEachRow(1),number FROM system.numbers LIMIT 660",
        host=long_running_svc,
        timeout=670)
    with Then("check ClickHouseLongestRunningQuery firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseLongestRunningQuery", "firing", True,
            labels={"hostname": long_running_svc},
            time_range='30s')
        assert fired, error("can't get ClickHouseLongestRunningQuery alert in firing state")
    with Then("check ClickHouseLongestRunningQuery gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseLongestRunningQuery", "firing", False,
            labels={"hostname": long_running_svc})
        assert resolved, error("can't check ClickHouseLongestRunningQuery alert is gone away")
def test_018():
    create_and_check(
        "configs/test-018-configmap.yaml",
        {
            "pod_count": 1,
            "do_not_delete": 1,
        })
    chi_name = "test-018-configmap"

    with Then("user1/networks/ip should be in config"):
        chi = kube_get("chi", chi_name)
        assert "user1/networks/ip" in chi["spec"]["configuration"]["users"]

    start_time = kube_get_field("pod", f"chi-{chi_name}-default-0-0-0", ".status.startTime")

    create_and_check(
        "configs/test-018-configmap-2.yaml",
        {
            "pod_count": 1,
            "do_not_delete": 1,
        })
    with Then("user2/networks should be in config"):
        chi = kube_get("chi", chi_name)
        assert "user2/networks/ip" in chi["spec"]["configuration"]["users"]
        with And("user1/networks/ip should NOT be in config"):
            assert "user1/networks/ip" not in chi["spec"]["configuration"]["users"]
        with And("Pod should not be restarted"):
            new_start_time = kube_get_field("pod", f"chi-{chi_name}-default-0-0-0", ".status.startTime")
            assert start_time == new_start_time

    kube_delete_chi(chi_name)
def test_020(config="configs/test-020-multi-volume.yaml"):
    chi = get_chi_name(get_full_path(config))
    create_and_check(
        config,
        {
            "pod_count": 1,
            "pod_volumes": {"/var/lib/clickhouse", "/var/lib/clickhouse2"},
            "do_not_delete": 1,
        })
    with When("Create a table and insert 1 row"):
        clickhouse_query(chi, "create table test_disks(a Int8) Engine = MergeTree() order by a")
        clickhouse_query(chi, "insert into test_disks values (1)")

        with Then("Data should be placed on default disk"):
            out = clickhouse_query(chi, "select disk_name from system.parts where table='test_disks'")
            assert out == 'default'

    with When("alter table test_disks move partition tuple() to disk 'disk2'"):
        clickhouse_query(chi, "alter table test_disks move partition tuple() to disk 'disk2'")

        with Then("Data should be placed on disk2"):
            out = clickhouse_query(chi, "select disk_name from system.parts where table='test_disks'")
            assert out == 'disk2'

    kube_delete_chi(chi)
def test_replicas_max_absolute_delay():
    stop_replica_pod, stop_replica_svc, insert_pod, insert_svc = random_pod_choice_for_callbacks()
    create_table_on_cluster(
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/{uuid}/test_repl\', \'{replica}\') ORDER BY tuple()')
    prometheus_scrape_interval = 15

    def restart_clickhouse_and_insert_to_replicated_table():
        with When(f"stop replica fetches on {stop_replica_svc}"):
            sql = "SYSTEM STOP FETCHES default.test_repl"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
                ok_to_fail=True,
            )
            sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
            )

    with Then("check ClickHouseReplicasMaxAbsoluteDelay firing"):
        fired = wait_alert_state(
            "ClickHouseReplicasMaxAbsoluteDelay", "firing", True,
            labels={"hostname": stop_replica_svc},
            time_range='60s',
            sleep_time=prometheus_scrape_interval * 2,
            callback=restart_clickhouse_and_insert_to_replicated_table)
        assert fired, error("can't get ClickHouseReplicasMaxAbsoluteDelay alert in firing state")

    clickhouse.query(
        chi["metadata"]["name"],
        "SYSTEM START FETCHES; SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=stop_replica_svc,
        timeout=240)

    with Then("check ClickHouseReplicasMaxAbsoluteDelay gone away"):
        resolved = wait_alert_state(
            "ClickHouseReplicasMaxAbsoluteDelay", "firing", False,
            labels={"hostname": stop_replica_svc})
        assert resolved, error("can't check ClickHouseReplicasMaxAbsoluteDelay alert is gone away")

    drop_table_on_cluster('all-replicated', 'default.test_repl')
def test_009():
    version_from = "0.6.0"
    version_to = "dev"
    with Given(f"clickhouse-operator {version_from}"):
        set_operator_version(version_from)
        config = get_full_path("configs/test-009-long-name.yaml")
        chi_full_name = get_chi_name(config)
        chi_cut_name = chi_full_name[0:15]

        kube_apply(config)
        kube_wait_objects(chi_cut_name, [1, 1, 2])
        kube_wait_chi_status(chi_full_name, "Completed")

        assert kube_get_count("statefulset", label="-l clickhouse.altinity.com/app=chop") == 1, error()

    with Then(f"upgrade operator to {version_to}"):
        set_operator_version(version_to)
        with And("Wait 20 seconds"):
            time.sleep(20)
        with Then("No new statefulsets should be created"):
            assert kube_get_count("statefulset", label="-l clickhouse.altinity.com/app=chop") == 1, error()
def test_009(version_from="0.8.0", version_to=settings.operator_version):
    with Then("Test simple chi for operator upgrade"):
        test_operator_upgrade("configs/test-009-operator-upgrade.yaml", version_from, version_to)
    with Then("Test advanced chi for operator upgrade"):
        test_operator_upgrade("configs/test-009-operator-upgrade-2.yaml", version_from, version_to)
def test_015():
    kubectl.create_and_check(
        config="configs/test-015-host-network.yaml",
        check={
            "pod_count": 2,
            "do_not_delete": 1,
        })

    time.sleep(30)

    with Then("Query from one server to another one should work"):
        out = clickhouse.query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            sql="SELECT * FROM remote('chi-test-015-host-network-default-0-1', system.one)")
        print("remote out=")
        print(out)

    with Then("Distributed query should work"):
        out = clickhouse.query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            sql="SELECT count() FROM cluster('all-sharded', system.one) settings receive_timeout=10")
        print("cluster out=")
        print(out)
        assert out == "2"

    kubectl.delete_chi("test-015-host-network")
def test_006():
    with Then("Create initial position"):
        kubectl.create_and_check(
            config="configs/test-006-ch-upgrade-1.yaml",
            check={
                "pod_count": 2,
                "pod_image": "yandex/clickhouse-server:19.11",
                "do_not_delete": 1,
            })
    with Then("Use different podTemplate and confirm that pod image is updated"):
        kubectl.create_and_check(
            config="configs/test-006-ch-upgrade-2.yaml",
            check={
                "pod_count": 2,
                "pod_image": "yandex/clickhouse-server:19.16",
                "do_not_delete": 1,
            })
    with Then("Change image in podTemplate itself and confirm that pod image is updated"):
        kubectl.create_and_check(
            config="configs/test-006-ch-upgrade-3.yaml",
            check={
                "pod_count": 2,
                "pod_image": "yandex/clickhouse-server:19.11",
            })
def test_read_only_replica():
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_replicated_table_on_cluster()

    def restart_zookeeper():
        kubectl.kubectl(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.clickhouse_query_with_error(
            chi_name,
            "INSERT INTO default.test_repl VALUES(now(),rand())",
            host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", True,
            labels={"hostname": read_only_svc},
            time_range='30s', sleep_time=5,
            callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", False,
            labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.kube_wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.kube_wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    clickhouse.clickhouse_query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240)
    clickhouse.clickhouse_query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240)

    drop_replicated_table_on_cluster()
def test_016():
    create_and_check(
        "configs/test-016-settings.yaml",
        {
            "apply_templates": {settings.clickhouse_template},
            "pod_count": 1,
            "do_not_delete": 1,
        })
    with Then("dictGet() should work"):
        out = clickhouse_query("test-016-settings", query="select dictGet('one', 'one', toUInt64(0))")
        assert out == "0"
    with Then("Custom macro 'layer' should be available"):
        out = clickhouse_query("test-016-settings", query="select substitution from system.macros where macro='layer'")
        assert out == "01"
    with Then("query_log should be disabled"):
        clickhouse_query("test-016-settings", query="system flush logs")
        out = clickhouse_query_with_error("test-016-settings", query="select count() from system.query_log")
        assert "doesn't exist" in out
    kube_delete_chi("test-016-settings")
def test_replicas_max_absolute_delay():
    stop_replica_pod, stop_replica_svc, insert_pod, insert_svc = random_pod_choice_for_callbacks()
    create_replicated_table_on_cluster()
    prometheus_scrape_interval = 30

    def restart_clickhouse_and_insert_to_replicated_table():
        with When(f"stop replica fetches on {stop_replica_svc}"):
            sql = "SYSTEM STOP FETCHES default.test_repl"
            kubectl.kubectl(
                f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
                ok_to_fail=True,
            )
            sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
            kubectl.kubectl(
                f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
            )

    with Then("check ReplicasMaxAbsoluteDelay firing"):
        fired = wait_alert_state(
            "ReplicasMaxAbsoluteDelay", "firing", True,
            labels={"hostname": stop_replica_svc},
            time_range='60s',
            sleep_time=prometheus_scrape_interval * 2,
            callback=restart_clickhouse_and_insert_to_replicated_table)
        assert fired, error("can't get ReplicasMaxAbsoluteDelay alert in firing state")

    clickhouse.clickhouse_query(
        chi["metadata"]["name"],
        "SYSTEM START FETCHES; SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        timeout=240)

    with Then("check ReplicasMaxAbsoluteDelay gone away"):
        resolved = wait_alert_state(
            "ReplicasMaxAbsoluteDelay", "firing", False,
            labels={"hostname": stop_replica_svc})
        assert resolved, error("can't check ReplicasMaxAbsoluteDelay alert is gone away")

    drop_replicated_table_on_cluster()
def test_ch_002(self):
    kubectl.create_and_check(
        "configs/test-ch-002-row-level.yaml",
        {
            "apply_templates": {"templates/tpl-clickhouse-20.3.yaml"},
            "do_not_delete": 1,
        })

    chi = "test-ch-002-row-level"
    create_table = """create table test (d Date default today(), team LowCardinality(String), user String) Engine = MergeTree() PARTITION BY d ORDER BY d;"""

    with When("Create test table"):
        clickhouse.query(chi, create_table)

    with And("Insert some data"):
        clickhouse.query(
            chi,
            "INSERT INTO test(team, user) values('team1', 'user1'),('team2', 'user2'),('team3', 'user3'),('team4', 'user4')")

    with Then("Make another query for different users. It should be restricted to corresponding team by row-level security"):
        for user in ['user1', 'user2', 'user3', 'user4']:
            out = clickhouse.query(chi, "select user from test", user=user)
            assert out == user

    with Then("Make a count() query for different users. It should be restricted to corresponding team by row-level security"):
        for user in ['user1', 'user2', 'user3', 'user4']:
            out = clickhouse.query(chi, "select count() from test", user=user)
            assert out == "1"

    kubectl.delete_chi(chi)
def test_read_only_replica():
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_table_on_cluster(
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/{uuid}/test_repl\', \'{replica}\') ORDER BY tuple()')

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(
            chi_name,
            "INSERT INTO default.test_repl VALUES(now(),rand())",
            host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", True,
            labels={"hostname": read_only_svc},
            time_range='30s', sleep_time=5,
            callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = wait_alert_state(
            "ClickHouseReadonlyReplica", "firing", False,
            labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    clickhouse.query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240)
    clickhouse.query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240)

    drop_table_on_cluster('all-replicated', 'default.test_repl')
def test_metrics_exporter_with_multiple_clickhouse_version():
    def check_monitoring_metrics(operator_namespace, operator_pod, expect_result, max_retries=10):
        with And(f"metrics-exporter /metrics endpoint result should match with {expect_result}"):
            for i in range(1, max_retries):
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace)
                all_strings_expected_done = True
                for string, exists in expect_result.items():
                    all_strings_expected_done = (exists == (string in out))
                    if not all_strings_expected_done:
                        break

                if all_strings_expected_done:
                    break
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
            assert all_strings_expected_done, error()

    with Given("clickhouse-operator pod exists"):
        out = kubectl.launch("get pods -l app=clickhouse-operator", ns='kube-system').splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = "kube-system"

        with Then("check empty /metrics"):
            kubectl.delete_ns(kubectl.namespace, ok_to_fail=True)
            kubectl.create_ns(kubectl.namespace)
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                'chi_clickhouse_metric_VersionInteger': False,
            })

        with Then("Install multiple clickhouse versions"):
            config = util.get_full_path("configs/test-017-multi-version.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 4,
                        "pod": 4,
                        "service": 5,
                    },
                    "do_not_delete": True,
                })
            with And("Check not empty /metrics"):
                check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                    '# HELP chi_clickhouse_metric_VersionInteger': True,
                    '# TYPE chi_clickhouse_metric_VersionInteger gauge': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-0-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-1-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-2-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-3-0': True,
                })

        with Then("check empty /metrics after delete namespace"):
            kubectl.delete_ns(kubectl.namespace)
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                'chi_clickhouse_metric_VersionInteger': False,
            })
def test_clickhouse_server_reboot():
    random_idx = random.randint(0, 1)
    clickhouse_pod = chi["status"]["pods"][random_idx]
    clickhouse_svc = chi["status"]["fqdns"][random_idx]

    def reboot_clickhouse_server():
        kubectl.launch(
            f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- kill 1",
            ok_to_fail=True,
        )

    with When("reboot clickhouse-server pod"):
        fired = wait_alert_state(
            "ClickHouseServerDown", "firing", True,
            labels={"hostname": clickhouse_svc, "chi": chi["metadata"]["name"]},
            callback=reboot_clickhouse_server,
            sleep_time=5, time_range='30s', max_try=30,
        )
        assert fired, error("can't get ClickHouseServerDown alert in firing state")

    with Then("check ClickHouseServerDown gone away"):
        resolved = wait_alert_state(
            "ClickHouseServerDown", "firing", False,
            labels={"hostname": clickhouse_svc},
            time_range='5s', sleep_time=5)
        assert resolved, error("can't check ClickHouseServerDown alert is gone away")

    with Then("check ClickHouseServerRestartRecently firing and gone away"):
        fired = wait_alert_state(
            "ClickHouseServerRestartRecently", "firing", True,
            labels={"hostname": clickhouse_svc, "chi": chi["metadata"]["name"]},
            time_range="30s")
        assert fired, error("after ClickHouseServerDown gone away, ClickHouseServerRestartRecently shall fire")

        resolved = wait_alert_state(
            "ClickHouseServerRestartRecently", "firing", False,
            labels={"hostname": clickhouse_svc})
        assert resolved, error("can't check ClickHouseServerRestartRecently alert is gone away")
def test_015():
    create_and_check(
        "configs/test-015-host-network.yaml",
        {
            "pod_count": 2,
            "do_not_delete": 1,
        })
    with Then("Query from one server to another one should work"):
        clickhouse_query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            query="select * from remote('chi-test-015-host-network-default-0-1', system.one)")
    with Then("Distributed query should work"):
        out = clickhouse_query(
            "test-015-host-network",
            host="chi-test-015-host-network-default-0-0",
            port="10000",
            query="select count() from cluster('all-sharded', system.one) settings receive_timeout=10")
        assert out == "2"
    kube_delete_chi("test-015-host-network")
def test_013():
    create_and_check(
        "configs/test-013-add-shards-1.yaml",
        {
            "apply_templates": {settings.clickhouse_template},
            "object_counts": [1, 1, 2],
            "do_not_delete": 1,
        })

    with Then("Create local and distributed table"):
        clickhouse_query(
            "test-013-add-shards",
            "CREATE TABLE test_local Engine = Log as select * from system.one")
        clickhouse_query(
            "test-013-add-shards",
            "CREATE TABLE test_distr as test_local Engine = Distributed('default', default, test_local)")

    with Then("Add one more shard"):
        create_and_check(
            "configs/test-013-add-shards-2.yaml",
            {
                "object_counts": [2, 2, 3],
                "do_not_delete": 1,
            })
        with And("Table should be created on a second shard"):
            out = clickhouse_query(
                "test-013-add-shards",
                "select count() from default.test_distr",
                host="chi-test-013-add-shards-default-1-0")
            assert out == "1"

    with Then("Remove shard"):
        create_and_check("configs/test-013-add-shards-1.yaml", {"object_counts": [1, 1, 2]})
def check_alert_state(alert_name, prometheus_pod, alert_state="firing", labels=None, time_range="10s"):
    with Then(f"check {alert_name} for state {alert_state} and {labels} labels in {time_range}"):
        # resulting PromQL: ALERTS{alertname="...",alertstate="...",<extra labels>}[time_range]
        cmd = f"exec -n {settings.prometheus_namespace} {prometheus_pod} -c prometheus -- "
        cmd += "wget -qO- 'http://127.0.0.1:9090/api/v1/query?query=ALERTS{"
        if labels is None:
            labels = {}
        if not isinstance(labels, dict):
            fail(f"Invalid labels={labels}")
        labels.update({"alertname": alert_name, "alertstate": alert_state})
        cmd += ",".join([f"{name}=\"{value}\"" for name, value in labels.items()])
        cmd += f"}}[{time_range}]' 2>/dev/null"
        out = kubectl.launch(cmd)
        out = json.loads(out)
        if not ("status" in out and out["status"] == "success"):
            fail("wrong response from prometheus query API")
        if len(out["data"]["result"]) == 0:
            with Then("not present, empty result"):
                return False
        result_labels = out["data"]["result"][0]["metric"].items()
        exists = all(item in result_labels for item in labels.items())
        with Then("got result and contains labels" if exists else "got result, but doesn't contain labels"):
            return exists
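# `wait_alert_state` is called throughout these tests but defined outside this section.
# A minimal sketch of the polling wrapper it plausibly is, built on `check_alert_state`
# above; parameter names mirror the call sites, while the prometheus pod lookup (the
# `app=prometheus` label), the defaults, and the retry policy are assumptions, not the
# actual implementation.
def wait_alert_state(alert_name, alert_state, expected_state, labels=None,
                     time_range="10s", sleep_time=5, callback=None, max_try=20):
    # locate a prometheus pod to query (label selector is an assumption; adjust to your setup)
    prometheus_pod = kubectl.launch(
        "get pods -l app=prometheus -o jsonpath='{.items[0].metadata.name}'",
        ns=settings.prometheus_namespace)
    for _ in range(max_try):
        # optionally provoke the condition under test (restart zookeeper, run queries, ...)
        if callback is not None:
            callback()
        if check_alert_state(alert_name, prometheus_pod, alert_state, labels, time_range) == expected_state:
            return True
        time.sleep(sleep_time)
    return False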
def test_zookeeper_hardware_exceptions():
    pod1, svc1, pod2, svc2 = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc1)
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc2)

    with Then("check ClickHouseZooKeeperHardwareExceptions firing"):
        for svc in (svc1, svc2):
            fired = wait_alert_state(
                "ClickHouseZooKeeperHardwareExceptions", "firing", True,
                labels={"hostname": svc},
                time_range='40s', sleep_time=5,
                callback=restart_zookeeper)
            assert fired, error("can't get ClickHouseZooKeeperHardwareExceptions alert in firing state")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true", ns=kubectl.namespace)

    with Then("check ClickHouseZooKeeperHardwareExceptions gone away"):
        for svc in (svc1, svc2):
            resolved = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", False, labels={"hostname": svc})
            assert resolved, error("can't check ClickHouseZooKeeperHardwareExceptions alert is gone away")
def test_016():
    chi = "test-016-settings"
    create_and_check(
        "configs/test-016-settings.yaml",
        {
            "apply_templates": {settings.clickhouse_template},
            "pod_count": 1,
            "do_not_delete": 1,
        })

    with Then("Custom macro 'layer' should be available"):
        out = clickhouse_query(chi, query="select substitution from system.macros where macro='layer'")
        assert out == "01"

    with And("Custom macro 'test' should be available"):
        out = clickhouse_query(chi, query="select substitution from system.macros where macro='test'")
        assert out == "test"

    with And("dictGet() should work"):
        out = clickhouse_query(chi, query="select dictGet('one', 'one', toUInt64(0))")
        assert out == "0"

    with And("query_log should be disabled"):
        clickhouse_query(chi, query="system flush logs")
        out = clickhouse_query_with_error(chi, query="select count() from system.query_log")
        assert "doesn't exist" in out

    with And("max_memory_usage should be 7000000000"):
        out = clickhouse_query(chi, query="select value from system.settings where name='max_memory_usage'")
        assert out == "7000000000"

    with And("test_usersd user should be available"):
        clickhouse_query(chi, query="select version()", user="******")

    with When("Update usersd settings"):
        start_time = kube_get_field("pod", f"chi-{chi}-default-0-0-0", ".status.startTime")
        create_and_check("configs/test-016-settings-2.yaml", {"do_not_delete": 1})
        with Then("Wait 60 seconds for configmap changes to apply"):
            time.sleep(60)
        with Then("test_norestart user should be available"):
            clickhouse_query(chi, query="select version()", user="******")
        with And("ClickHouse should not be restarted"):
            new_start_time = kube_get_field("pod", f"chi-{chi}-default-0-0-0", ".status.startTime")
            assert start_time == new_start_time

    kube_delete_chi("test-016-settings")
def test_zookeeper_alerts(self):
    zookeeper_spec = kubectl.get("endpoints", "zookeeper")
    zookeeper_pod = random.choice(zookeeper_spec["subsets"][0]["addresses"])["targetRef"]["name"]

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} {zookeeper_pod} -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )

    def wait_when_zookeeper_up():
        kubectl.wait_pod_status(zookeeper_pod, "Running", ns=kubectl.namespace)
        kubectl.wait_jsonpath("pod", zookeeper_pod, "{.status.containerStatuses[0].ready}", "true",
                              ns=kubectl.namespace)

    with Then("check ZookeeperDown firing"):
        fired = alerts.wait_alert_state(
            "ZookeeperDown", "firing", True,
            labels={"pod_name": zookeeper_pod},
            time_range='1m',
            sleep_time=settings.prometheus_scrape_interval,
            callback=restart_zookeeper)
        assert fired, error("can't get ZookeeperDown alert in firing state")

    wait_when_zookeeper_up()

    with Then("check ZookeeperDown gone away"):
        resolved = alerts.wait_alert_state("ZookeeperDown", "firing", False, labels={"pod_name": zookeeper_pod})
        assert resolved, error("can't check ZookeeperDown alert is gone away")

    restart_zookeeper()

    with Then("check ZookeeperRestartRecently firing"):
        fired = alerts.wait_alert_state(
            "ZookeeperRestartRecently", "firing", True,
            labels={"pod_name": zookeeper_pod},
            time_range='30s')
        assert fired, error("can't get ZookeeperRestartRecently alert in firing state")

    wait_when_zookeeper_up()

    with Then("check ZookeeperRestartRecently gone away"):
        resolved = alerts.wait_alert_state("ZookeeperRestartRecently", "firing", False,
                                           labels={"pod_name": zookeeper_pod})
        assert resolved, error("can't check ZookeeperRestartRecently alert is gone away")
def test_014():
    require_zookeeper()

    create_table = """
    create table t (a Int8)
    Engine = ReplicatedMergeTree('/clickhouse/{installation}/{cluster}/tables/{shard}/{database}/{table}', '{replica}')
    partition by tuple() order by a""".replace('\r', '').replace('\n', '')

    create_and_check(
        "configs/test-014-replication.yaml",
        {
            "apply_templates": {settings.clickhouse_template},
            "object_counts": [2, 2, 3],
            "do_not_delete": 1,
        })

    with Given("Table is created on a first replica and data is inserted"):
        clickhouse_query("test-014-replication", create_table, host="chi-test-014-replication-default-0-0")
        clickhouse_query("test-014-replication", "insert into t values(1)", host="chi-test-014-replication-default-0-0")
        with When("Table is created on the second replica"):
            clickhouse_query("test-014-replication", create_table, host="chi-test-014-replication-default-0-1")
            with Then("Data should be replicated"):
                out = clickhouse_query("test-014-replication", "select a from t",
                                       host="chi-test-014-replication-default-0-1")
                assert out == "1"

    with When("Add one more replica"):
        create_and_check("configs/test-014-replication-2.yaml", {"pod_count": 3, "do_not_delete": 1})
        # that also works:
        # kubectl patch chi test-014-replication -n test --type=json -p '[{"op":"add", "path": "/spec/configuration/clusters/0/layout/shards/0/replicasCount", "value": 3}]'
        with Then("Replicated table should be automatically created"):
            out = clickhouse_query("test-014-replication", "select a from t",
                                   host="chi-test-014-replication-default-0-2")
            assert out == "1"

    with When("Remove replica"):
        create_and_check("configs/test-014-replication.yaml", {"pod_count": 1, "do_not_delete": 1})
        with Then("Replica needs to be removed from the Zookeeper as well"):
            out = clickhouse_query("test-014-replication", "select count() from system.replicas where table='t'")
            assert out == "1"

    kube_delete_chi("test-014-replication")
def kube_wait_objects(chi, objects, ns="test"):
    with Then(f"{objects[0]} statefulsets, {objects[1]} pods and {objects[2]} services should be created"):
        for i in range(1, max_retries):
            counts = kube_count_resources(label=f"-l clickhouse.altinity.com/chi={chi}", ns=ns)
            if counts == objects:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert counts == objects, error()
def kube_wait_object(type, name, label="", count=1, ns="test", retries=max_retries):
    with Then(f"{count} {type}(s) {name} should be created"):
        for i in range(1, retries):
            counts = kube_get_count(type, ns=ns, name=name, label=label)
            if counts >= count:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert counts >= count, error()
def wait_command(command, result, count=1, ns=namespace, retries=max_retries):
    with Then(f"{command} should return {result}"):
        for i in range(1, retries):
            res = launch(command, ok_to_fail=True, ns=ns)
            if res == result:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert res == result, error()
def wait_jsonpath(kind, name, field, value, ns=namespace, retries=max_retries):
    with Then(f"{kind} {name} -o jsonpath={field} should be {value}"):
        for i in range(1, retries):
            cur_value = get_jsonpath(kind, name, field, ns)
            if cur_value == value:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert cur_value == value, error()
def wait_object(kind, name, label="", count=1, ns=namespace, retries=max_retries, backoff=5):
    with Then(f"{count} {kind}(s) {name} should be created"):
        for i in range(1, retries):
            cur_count = get_count(kind, ns=ns, name=name, label=label)
            if cur_count >= count:
                break
            with Then("Not ready. Wait for " + str(i * backoff) + " seconds"):
                time.sleep(i * backoff)
        assert cur_count >= count, error()
def kube_wait_field(object, name, field, value, ns="test", retries=max_retries):
    with Then(f"{object} {name} {field} should be {value}"):
        for i in range(1, retries):
            obj_status = kubectl(f"get {object} {name} -o=custom-columns=field:{field}", ns=ns).splitlines()
            if obj_status[1] == value:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert obj_status[1] == value, error()
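# The kube_wait_* / wait_* helpers above all repeat the same linear-backoff poll:
# fetch, compare, sleep i*backoff seconds, assert once retries are exhausted. A generic
# loop they could delegate to — an illustrative refactoring sketch, not a helper that
# exists in this repo:
def wait_until(describe, fetch, predicate, retries=max_retries, backoff=5):
    with Then(describe):
        for i in range(1, retries):
            current = fetch()
            if predicate(current):
                break
            with Then("Not ready. Wait for " + str(i * backoff) + " seconds"):
                time.sleep(i * backoff)
        assert predicate(current), error()

# e.g. wait_object(kind, name, ...) reduces to:
# wait_until(f"{count} {kind}(s) {name} should be created",
#            lambda: get_count(kind, ns=ns, name=name, label=label),
#            lambda cur: cur >= count)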
def test_backup_not_run(self):
    not_run_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup for time metric")

    with Then(f"wait {not_run_pod} ready"):
        kubectl.wait_field("pod", not_run_pod, ".spec.containers[1].image", "nginx:latest")
        kubectl.wait_field("pod", not_run_pod, ".status.containerStatuses[1].ready", "true")

    with Then(f"setup {not_run_pod} backup create end time"):
        kubectl.launch(
            f'exec {not_run_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_finish Last backup create finish timestamp" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_finish gauge" >> /usr/share/nginx/html/metrics && '
            f'echo "clickhouse_backup_last_create_finish {int((datetime.datetime.now() - datetime.timedelta(days=2)).timestamp())}" >> /usr/share/nginx/html/metrics '
            '\'')

    fired = alerts.wait_alert_state(
        "ClickhouseBackupDoesntRunTooLong", "firing", expected_state=True,
        sleep_time=settings.prometheus_scrape_interval,
        labels={"pod_name": not_run_pod},
        time_range='60s')
    assert fired, error("can't get ClickhouseBackupDoesntRunTooLong alert in firing state")

    apply_normal_backup()

    backup_name = prepare_table_for_backup(not_run_pod)
    wait_backup_pod_ready_and_curl_installed(not_run_pod)

    with When('Backup is success'):
        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"')
        wait_backup_command_status(not_run_pod, f'create {backup_name}', expected_status='success')

        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/upload/{backup_name}"')
        wait_backup_command_status(not_run_pod, f'upload {backup_name}', expected_status='success')

    with Then("check ClickhouseBackupDoesntRunTooLong gone away"):
        resolved = alerts.wait_alert_state(
            "ClickhouseBackupDoesntRunTooLong", "firing", expected_state=False,
            labels={"pod_name": not_run_pod})
        assert resolved, error("can't check ClickhouseBackupDoesntRunTooLong alert is gone away")