def reboot_metrics_exporter():
    """Run ``reboot`` in the metrics-exporter container of the operator pod.

    The pod name is taken from the first item of ``clickhouse_operator_spec``.
    The exec is launched with ``ok_to_fail=True``, so a failing command is
    tolerated.
    """
    first_operator = clickhouse_operator_spec["items"][0]
    clickhouse_operator_pod = first_operator["metadata"]["name"]
    reboot_cmd = (
        f"exec -n {settings.operator_namespace} {clickhouse_operator_pod}"
        " -c metrics-exporter -- reboot"
    )
    kubectl.launch(reboot_cmd, ok_to_fail=True)
def restart_zookeeper():
    """Run ``kill 1`` in pod ``zookeeper-0``, then query system.zookeeper.

    The kill is launched with ``ok_to_fail=True``. Afterwards the same
    ``system.zookeeper`` SELECT is issued through ``svc1`` and ``svc2`` using
    ``query_with_error``.
    """
    kill_cmd = f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\""
    kubectl.launch(kill_cmd, ok_to_fail=True)

    probe_sql = "SELECT name, path FROM system.zookeeper WHERE path='/'"
    for target_svc in (svc1, svc2):
        clickhouse.query_with_error(chi_name, probe_sql, host=target_svc)
def make_too_many_connection():
    """Open 120 connections in parallel inside ``too_many_connection_pod``.

    A shell script is written to ``/tmp/nc_cmd.sh`` locally, copied into the
    ``clickhouse`` container and executed there with a 600 second timeout.
    Each of the 120 commands targets a randomly chosen port.
    """

    def connection_cmd(port):
        # One shell command (terminated by ';') per connection.
        if port == "8123":
            # The HTTPConnection metric only increases after the HTTP request
            # is fully parsed, so a slow query is issued instead of a raw
            # connection.
            return 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
        if port == "9000":
            return 'clickhouse-client --idle_connection_timeout 70 --receive_timeout 70 -q "SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
        # Any other port (3306): hold a plain netcat connection open.
        return f"printf \"1\\n1\" | nc -q 5 -i 30 -vv 127.0.0.1 {port};"

    long_cmd = "".join(
        connection_cmd(random.choice(["8123", "3306", "9000"]))
        for _ in range(120)
    )

    # xargs splits on ';' and runs up to 120 commands concurrently.
    nc_cmd = f"echo '{long_cmd} exit 0' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 120 bash -c '{{}}' 1>/dev/null"
    with open("/tmp/nc_cmd.sh", "w") as script:
        script.write(nc_cmd)

    copy_cmd = f"cp /tmp/nc_cmd.sh {too_many_connection_pod}:/tmp/nc_cmd.sh -c clickhouse"
    kubectl.launch(copy_cmd)

    run_cmd = f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse -- bash /tmp/nc_cmd.sh"
    kubectl.launch(run_cmd, timeout=600)
def test_metrics_exporter_reboot():
    """Check that metrics-exporter keeps its monitored CHI list across a reboot.

    Steps: wait for the operator pod, recreate the test namespace and expect
    an empty ``/chi`` result, install a simple CHI and expect it to be listed,
    run ``reboot`` in the metrics-exporter container, expect the same list
    again, then delete the CHI and expect an empty list.
    """

    def check_monitoring_chi(operator_namespace, operator_pod, expect_result, max_retries=10):
        """Poll the exporter ``/chi`` endpoint until it equals ``expect_result``.

        Makes ``max_retries - 1`` attempts, sleeping ``attempt * 5`` seconds
        between them, then asserts on the last parsed result.
        """
        with And(f"metrics-exporter /chi enpoint result should return {expect_result}"):
            # Bound up-front so the assert below fails cleanly (instead of
            # raising UnboundLocalError) when the loop body never runs.
            out = None
            last_attempt = max_retries - 1
            for i in range(1, max_retries):
                # check /metrics for try to refresh monitored instances
                kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                # check /chi after refresh monitored instances
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/chi",
                    ns=operator_namespace
                )
                out = json.loads(out)
                if out == expect_result:
                    break
                # No point in sleeping after the final attempt: nothing is
                # re-checked afterwards.
                if i < last_attempt:
                    with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                        time.sleep(i * 5)
            assert out == expect_result, error()

    with Given("clickhouse-operator is installed"):
        kubectl.wait_field("pods", "-l app=clickhouse-operator", ".status.containerStatuses[*].ready", "true,true",
                           ns=settings.operator_namespace)
        assert kubectl.get_count("pod", ns='--all-namespaces', label="-l app=clickhouse-operator") > 0, error()

        # Second line of `kubectl get pods` output; its first column is used
        # as the pod name.
        out = kubectl.launch("get pods -l app=clickhouse-operator", ns=settings.operator_namespace).splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = settings.operator_namespace
        kubectl.delete_ns(kubectl.namespace)
        kubectl.create_ns(kubectl.namespace)
        check_monitoring_chi(operator_namespace, operator_pod, [])

        with And("created simple clickhouse installation"):
            config = util.get_full_path("../docs/chi-examples/01-simple-layout-01-1shard-1repl.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 1,
                        "pod": 1,
                        "service": 2,
                    },
                    "do_not_delete": True,
                })
            expected_chi = [{
                "namespace": "test",
                "name": "simple-01",
                "hostnames": ["chi-simple-01-cluster-0-0.test.svc.cluster.local"]
            }]
            check_monitoring_chi(operator_namespace, operator_pod, expected_chi)

            with When("reboot metrics exporter"):
                kubectl.launch(f"exec -n {operator_namespace} {operator_pod} -c metrics-exporter -- reboot")
                time.sleep(15)
                kubectl.wait_field("pods", "-l app=clickhouse-operator", ".status.containerStatuses[*].ready",
                                   "true,true", ns=settings.operator_namespace)

                with Then("check metrics exporter still contains chi objects"):
                    check_monitoring_chi(operator_namespace, operator_pod, expected_chi)
                    kubectl.delete(config)
                    check_monitoring_chi(operator_namespace, operator_pod, [])
def create_success_backup():
    """Create a randomly named backup over the REST API and wait for success.

    The backup name is ``backup_prefix`` plus a random integer in [1, 4096].
    The create request is sent with curl from the ``clickhouse-backup``
    container of ``backup_pod``.
    """
    random_suffix = str(random.randint(1, 4096))
    backup_name = backup_prefix + "-" + random_suffix
    create_cmd = f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- curl -X POST -sL http://127.0.0.1:7171/backup/create?name={backup_name}"
    kubectl.launch(create_cmd)
    wait_backup_command_status(
        backup_pod,
        command_name=f'create {backup_name}',
        expected_status='success',
    )
def restart_zookeeper():
    """Run ``kill 1`` in pod ``zookeeper-0``, then attempt an INSERT.

    The kill is launched with ``ok_to_fail=True``. The INSERT into
    ``default.test_repl`` goes through ``read_only_svc`` via
    ``query_with_error``.
    """
    kill_cmd = f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\""
    kubectl.launch(kill_cmd, ok_to_fail=True)

    insert_sql = "INSERT INTO default.test_repl VALUES(now(),rand())"
    clickhouse.query_with_error(chi_name, insert_sql, host=read_only_svc)
def test_backup_not_run(self):
    """Check that ClickhouseBackupDoesntRunTooLong fires and then resolves.

    A fake backup container serves a ``clickhouse_backup_last_create_finish``
    metric dated two days in the past; the alert is expected to fire. Then the
    normal backup container is applied, a backup is created and uploaded, and
    the alert is expected to stop firing.
    """
    alert_name = "ClickhouseBackupDoesntRunTooLong"
    metrics_file = "/usr/share/nginx/html/metrics"

    not_run_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup for time metric")

    with Then(f"wait {not_run_pod} ready"):
        kubectl.wait_field("pod", not_run_pod, ".spec.containers[1].image", "nginx:latest")
        kubectl.wait_field("pod", not_run_pod, ".status.containerStatuses[1].ready", "true")

    with Then(f"setup {not_run_pod} backup create end time"):
        # Timestamp two days ago, written into the metrics file as the
        # last-finished-backup value.
        two_days_ago = int((datetime.datetime.now() - datetime.timedelta(days=2)).timestamp())
        write_metric_cmd = (
            f'exec {not_run_pod} -c clickhouse-backup -- bash -xc \''
            f'echo "# HELP clickhouse_backup_last_create_finish Last backup create finish timestamp" > {metrics_file} && '
            f'echo "# TYPE clickhouse_backup_last_create_finish gauge" >> {metrics_file} && '
            f'echo "clickhouse_backup_last_create_finish {two_days_ago}" >> {metrics_file} '
            '\''
        )
        kubectl.launch(write_metric_cmd)

        fired = alerts.wait_alert_state(
            alert_name,
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": not_run_pod},
            time_range='60s',
        )
        assert fired, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert in firing state")

    apply_normal_backup()
    backup_name = prepare_table_for_backup(not_run_pod)
    wait_backup_pod_ready_and_curl_installed(not_run_pod)

    with When('Backup is success'):
        # (REST path, command name reported by the status API), in order.
        backup_steps = (
            (f"create?name={backup_name}", f'create {backup_name}'),
            (f"upload/{backup_name}", f'upload {backup_name}'),
        )
        for rest_path, command_name in backup_steps:
            exec_on_backup_container(
                not_run_pod,
                f'curl -X POST -sL "http://127.0.0.1:7171/backup/{rest_path}"'
            )
            wait_backup_command_status(not_run_pod, command_name, expected_status='success')

    with Then("check ClickhouseBackupDoesntRunTooLong gone away"):
        resolved = alerts.wait_alert_state(
            alert_name,
            "firing",
            expected_state=False,
            labels={"pod_name": not_run_pod},
        )
        assert resolved, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert is gone away")
def test_too_many_connections(self):
    """Check that ClickHouseTooManyConnections fires and then resolves.

    Installs ``netcat`` and ``mysql-client`` in the chosen pod, then uses a
    callback that opens 120 parallel connections while waiting for the alert
    to fire. Finally waits for the alert to stop firing.
    """
    alert_name = "ClickHouseTooManyConnections"
    too_many_connection_pod, too_many_connection_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)

    cmd = "export DEBIAN_FRONTEND=noninteractive; apt-get update; apt-get install -y netcat mysql-client"
    install_cmd = f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse-pod -- bash -c \"{cmd}\""
    kubectl.launch(install_cmd, timeout=120)

    def make_too_many_connection():
        """Write, copy and run a script that opens 120 parallel connections."""

        def connection_cmd(port):
            if port == "8123":
                # The HTTPConnection metric only increases after the HTTP
                # request is fully parsed, so a slow query is issued instead
                # of a raw connection.
                return 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            if port == "9000":
                return 'clickhouse-client --send_logs_level information --idle_connection_timeout 70 --receive_timeout 70 -q "SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            # Any other port (3306): hold a plain netcat connection open.
            return f"printf \"1\\n1\" | nc -q 5 -i 30 -vv 127.0.0.1 {port};"

        # "3306" is listed three times, so it is picked more often.
        port_choices = ["8123", "3306", "3306", "3306", "9000"]
        long_cmd = "".join(
            connection_cmd(random.choice(port_choices)) for _ in range(120)
        )

        nc_cmd = f"echo '{long_cmd} whereis nc; exit 0' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 120 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/nc_cmd.sh", "w") as script:
            script.write(nc_cmd)

        kubectl.launch(
            f"cp /tmp/nc_cmd.sh {too_many_connection_pod}:/tmp/nc_cmd.sh -c clickhouse-pod"
        )
        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse-pod -- bash /tmp/nc_cmd.sh",
            timeout=600,
        )

    with Then("check ClickHouseTooManyConnections firing"):
        fired = alerts.wait_alert_state(
            alert_name,
            "firing",
            True,
            labels={"hostname": too_many_connection_svc},
            time_range='90s',
            callback=make_too_many_connection,
        )
        assert fired, error(
            "can't get ClickHouseTooManyConnections alert in firing state")

    with Then("check ClickHouseTooManyConnections gone away"):
        resolved = alerts.wait_alert_state(
            alert_name,
            "firing",
            False,
            labels={"hostname": too_many_connection_svc},
        )
        assert resolved, error(
            "can't check ClickHouseTooManyConnections alert is gone away")
def rewrite_dns_on_clickhouse_server(write_new=True):
    """Overwrite /etc/resolv.conf in the clickhouse container and drop DNS cache.

    Writes ``new_dns`` when ``write_new`` is true, otherwise ``old_dns``.
    Afterwards runs ``SYSTEM DROP DNS CACHE`` plus a cluster-wide SELECT via
    clickhouse-client; that second command is launched with
    ``ok_to_fail=True``.
    """
    if write_new:
        dns = new_dns
    else:
        dns = old_dns

    exec_prefix = f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- "

    write_resolv_conf = exec_prefix + f"bash -c \"printf \\\"{dns}\\\" > /etc/resolv.conf\""
    kubectl.launch(write_resolv_conf, ok_to_fail=False)

    drop_cache_and_query = exec_prefix + "clickhouse-client --echo -mn -q \"SYSTEM DROP DNS CACHE; SELECT count() FROM cluster('all-sharded',system,metrics)\""
    kubectl.launch(drop_cache_and_query, ok_to_fail=True)
def run_queries_with_priority():
    """Run 50 prioritized queries in parallel, then read QueryPreempted metrics.

    The statements are ':'-separated and fed to xargs, which runs up to 20
    clickhouse-client processes at once inside ``priority_pod``. Afterwards
    ``system.metric_log`` is queried through ``priority_svc``.
    """
    # Priorities cycle through 0..19.
    sql = "".join(
        f"SET priority={idx % 20};SELECT uniq(number) FROM numbers(20000000):"
        for idx in range(50)
    )
    cmd = f"echo \\\"{sql} SELECT 1\\\" | xargs -i'{{}}' --no-run-if-empty -d ':' -P 20 clickhouse-client --time -m -n -q \\\"{{}}\\\""
    kubectl.launch(f"exec {priority_pod} -- bash -c \"{cmd}\"", timeout=120)

    preempted_sql = "SELECT event_time, CurrentMetric_QueryPreempted FROM system.metric_log WHERE CurrentMetric_QueryPreempted > 0"
    clickhouse.query(chi["metadata"]["name"], preempted_sql, host=priority_svc)
def restart_clickhouse_and_insert_to_replicated_table():
    """Stop fetches on one replica, then insert 100000 rows via another pod.

    ``SYSTEM STOP FETCHES`` runs in ``stop_replica_pod`` with
    ``ok_to_fail=True``; the INSERT runs in ``insert_pod``.
    """

    def client_exec(pod, statement):
        # kubectl exec line running one clickhouse-client query in `pod`.
        return f"exec -n {kubectl.namespace} {pod} -c clickhouse -- clickhouse-client -q \"{statement}\""

    with When(f"stop replica fetches on {stop_replica_svc}"):
        stop_fetches = "SYSTEM STOP FETCHES default.test_repl"
        kubectl.launch(client_exec(stop_replica_pod, stop_fetches), ok_to_fail=True)

        bulk_insert = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
        kubectl.launch(client_exec(insert_pod, bulk_insert))
def test_too_much_running_queries(self):
    """Check that ClickHouseTooManyRunningQueries fires and then resolves.

    Installs ``mysql-client`` in the chosen pod (failure tolerated), then uses
    a callback that starts 90 long-running queries over randomly chosen
    protocols while waiting for the alert to fire. Finally waits for the alert
    to stop firing.
    """
    alert_name = "ClickHouseTooManyRunningQueries"
    _, _, too_many_queries_pod, too_many_queries_svc = alerts.random_pod_choice_for_callbacks(chi)

    cmd = "export DEBIAN_FRONTEND=noninteractive; apt-get update; apt-get install -y mysql-client"
    kubectl.launch(
        f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse-pod -- bash -c \"{cmd}\"",
        ok_to_fail=True,
    )

    def make_too_many_queries():
        """Write, copy and run a script that starts 90 slow queries in parallel."""
        long_cmd = ""
        for _ in range(90):
            port = random.choice(["8123", "3306", "9000"])
            if port == "9000":
                long_cmd += 'clickhouse-client -q "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "3306":
                long_cmd += 'mysql -h 127.0.0.1 -P 3306 -u default -e "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "8123":
                long_cmd += 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),now() FROM numbers(60)";'

        # xargs splits on ';' and runs up to 100 commands concurrently.
        long_cmd = f"echo '{long_cmd}' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 100 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/long_cmd.sh", "w") as f:
            f.write(long_cmd)

        kubectl.launch(
            f"cp /tmp/long_cmd.sh {too_many_queries_pod}:/tmp/long_cmd.sh -c clickhouse-pod"
        )
        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse-pod -- bash /tmp/long_cmd.sh",
            timeout=90,
        )

    with Then("check ClickHouseTooManyRunningQueries firing"):
        fired = alerts.wait_alert_state(
            alert_name,
            "firing",
            True,
            labels={"hostname": too_many_queries_svc},
            callback=make_too_many_queries,
            time_range="30s",
        )
        # Messages name the alert actually being checked (they previously
        # referred to ClickHouseTooManyConnections).
        assert fired, error(
            "can't get ClickHouseTooManyRunningQueries alert in firing state")

    with Then("check ClickHouseTooManyRunningQueries gone away"):
        resolved = alerts.wait_alert_state(
            alert_name,
            "firing",
            False,
            labels={"hostname": too_many_queries_svc},
            sleep_time=settings.prometheus_scrape_interval,
        )
        assert resolved, error(
            "can't check ClickHouseTooManyRunningQueries alert is gone away")
def create_fail_backup():
    """Request a backup that is expected to end with status ``error``.

    Before the create request, the backup's
    ``shadow/default/test_backup`` directory is pre-created with mode 0400
    inside the ``clickhouse-backup`` container; then the REST create call is
    issued and the command status is awaited.
    """
    random_suffix = str(random.randint(1, 4096))
    backup_name = backup_prefix + "-" + random_suffix
    backup_dir = f"/var/lib/clickhouse/backup/{backup_name}/shadow/default/test_backup"

    exec_prefix = f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- "
    kubectl.launch(exec_prefix + f"bash -c 'mkdir -v -m 0400 -p {backup_dir}'")
    kubectl.launch(exec_prefix + f"curl -X POST -sL http://127.0.0.1:7171/backup/create?name={backup_name}")

    wait_backup_command_status(
        backup_pod,
        command_name=f'create {backup_name}',
        expected_status='error',
    )
def restart_operator(ns=settings.operator_namespace, timeout=60):
    """Delete the clickhouse-operator pod and wait for a new one to be Running.

    :param ns: namespace the operator pod is looked up in.
    :param timeout: timeout passed to the ``delete pod`` launch.
    """
    operator_selector = "-l app=clickhouse-operator"

    def first_operator_pod():
        # Name of the first pod matching the operator label.
        pods = kubectl.get("pod", name="", ns=ns, label=operator_selector)
        return pods["items"][0]["metadata"]["name"]

    kubectl.launch(f"delete pod {first_operator_pod()}", ns=ns, timeout=timeout)
    kubectl.wait_object("pod", name="", ns=ns, label=operator_selector)
    # Look the pod up again after the wait before checking its status.
    kubectl.wait_pod_status(first_operator_pod(), "Running", ns=ns)
def reboot_clickhouse_and_distributed_exection():
    """Kill clickhouse in ``restarted_pod``, then insert/select via ``delayed_pod``.

    The kill is launched with ``ok_to_fail=True``; the SELECT uses
    ``query_with_error``.
    """
    # we need 70 delayed files for catch
    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
    select_sql = 'SELECT count() FROM default.test_distr'

    with Then("reboot clickhouse-server pod"):
        kill_cmd = f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1"
        kubectl.launch(kill_cmd, ok_to_fail=True)

        with And("Insert to distributed table"):
            clickhouse.query(
                chi["metadata"]["name"],
                insert_sql,
                host=delayed_pod,
                ns=kubectl.namespace,
            )

        with And("Select from distributed table"):
            clickhouse.query_with_error(
                chi["metadata"]["name"],
                select_sql,
                host=delayed_pod,
                ns=kubectl.namespace,
            )
def test_distributed_files_to_insert():
    """Check that ClickHouseDistributedFilesToInsertHigh fires and then resolves.

    Distributed sends are stopped on ``delayed_pod``; then, in a loop, the
    clickhouse process in ``restarted_pod`` is killed and rows are inserted
    until more than 55 pending files are seen (by metric or on disk) or the
    attempt cap is reached. After the alert fires, sends are re-enabled and
    the alert is expected to stop firing.
    """
    delayed_pod, delayed_svc, restarted_pod, _ = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    chi_name = chi["metadata"]["name"]
    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1000'
    clickhouse.query(
        chi_name, 'SYSTEM STOP DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    files_to_insert_from_metrics = 0
    files_to_insert_from_disk = 0
    tries = 0
    max_tries = 500
    # we need more than 50 delayed files for catch
    while files_to_insert_from_disk <= 55 and files_to_insert_from_metrics <= 55 and tries < max_tries:
        # Count the attempt so the cap is effective; without this the loop
        # could never terminate if the files did not accumulate.
        tries += 1
        kubectl.launch(
            f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
            ok_to_fail=True,
        )
        clickhouse.query(chi_name, insert_sql, pod=delayed_pod, host=delayed_pod, ns=kubectl.namespace)

        files_to_insert_from_metrics = int(clickhouse.query(
            chi_name, "SELECT value FROM system.metrics WHERE metric='DistributedFilesToInsert'",
            pod=delayed_pod, ns=kubectl.namespace
        ))

        files_to_insert_from_disk = int(kubectl.launch(
            f"exec -n {kubectl.namespace} {delayed_pod} -c clickhouse -- bash -c 'ls -la /var/lib/clickhouse/data/default/test_distr/*/*.bin 2>/dev/null | wc -l'",
            ok_to_fail=False,
        ))

    with When("reboot clickhouse-server pod"):
        fired = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", True,
                                 labels={"hostname": delayed_svc, "chi": chi_name})
        assert fired, error("can't get ClickHouseDistributedFilesToInsertHigh alert in firing state")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)

    clickhouse.query(
        chi_name, 'SYSTEM START DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    with Then("check ClickHouseDistributedFilesToInsertHigh gone away"):
        resolved = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", False,
                                    labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedFilesToInsertHigh alert is gone away")

    drop_distributed_table_on_cluster()
def test_clickhouse_dns_errors():
    """Check that ClickHouseDNSErrors fires and then resolves.

    The current /etc/resolv.conf of a randomly chosen pod is read, a copy with
    every ``nameserver`` line replaced by ``nameserver 1.1.1.1`` is written
    back by a callback while waiting for the alert, and the original content
    is restored before waiting for the alert to stop firing.
    """
    random_idx = random.randint(0, 1)
    clickhouse_pod = chi["status"]["pods"][random_idx]
    clickhouse_svc = chi["status"]["fqdns"][random_idx]

    old_dns = kubectl.launch(
        f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- cat /etc/resolv.conf",
        ok_to_fail=False,
    )
    # re.MULTILINE makes '^' match at the start of every line; without it the
    # substitution only applied when "nameserver" was the very first line of
    # the file and was otherwise a silent no-op.
    new_dns = re.sub(r'^nameserver (.+)', 'nameserver 1.1.1.1', old_dns, flags=re.MULTILINE)

    def rewrite_dns_on_clickhouse_server(write_new=True):
        """Write the new (or original) resolv.conf and drop the DNS cache."""
        dns = new_dns if write_new else old_dns
        kubectl.launch(
            f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- bash -c \"printf \\\"{dns}\\\" > /etc/resolv.conf\"",
            ok_to_fail=False,
        )
        # The query may fail while DNS is broken, hence ok_to_fail=True.
        kubectl.launch(
            f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- clickhouse-client --echo -mn -q \"SYSTEM DROP DNS CACHE; SELECT count() FROM cluster('all-sharded',system,metrics)\"",
            ok_to_fail=True,
        )

    with When("rewrite /etc/resolv.conf in clickhouse-server pod"):
        fired = wait_alert_state("ClickHouseDNSErrors", "firing", True, labels={"hostname": clickhouse_svc},
                                 time_range='20s', callback=rewrite_dns_on_clickhouse_server, sleep_time=5)
        assert fired, error("can't get ClickHouseDNSErrors alert in firing state")

    with Then("check ClickHouseDNSErrors gone away"):
        rewrite_dns_on_clickhouse_server(write_new=False)
        resolved = wait_alert_state("ClickHouseDNSErrors", "firing", False, labels={"hostname": clickhouse_svc})
        assert resolved, error("can't check ClickHouseDNSErrors alert is gone away")
def check_alert_state(alert_name, prometheus_pod, alert_state="firing", labels=None, time_range="10s"):
    """Query Prometheus for an ``ALERTS`` series and compare its labels.

    Runs ``wget`` against the Prometheus HTTP query API from inside
    ``prometheus_pod`` with an ``ALERTS{...}[time_range]`` selector built from
    ``labels`` plus ``alertname`` and ``alertstate``.

    :param alert_name: value for the ``alertname`` label.
    :param prometheus_pod: pod in which the query is executed.
    :param alert_state: value for the ``alertstate`` label.
    :param labels: optional dict of extra label matchers; it is not modified.
    :param time_range: range selector appended to the query.
    :return: ``False`` when the result is empty; otherwise whether every
        requested label pair is present in the first result's ``metric``.
    """
    with Then(
            f"check {alert_name} for state {alert_state} and {labels} labels in {time_range}"
    ):
        cmd = f"exec -n {settings.prometheus_namespace} {prometheus_pod} -c prometheus -- "
        cmd += "wget -qO- 'http://127.0.0.1:9090/api/v1/query?query=ALERTS{"

        if labels is None:
            labels = {}
        if not isinstance(labels, dict):
            fail(f"Invalid labels={labels}")

        # Build a fresh dict instead of labels.update(...): the caller's dict
        # (often reused across polling iterations) must not be mutated.
        query_labels = {**labels, "alertname": alert_name, "alertstate": alert_state}

        cmd += ",".join(f"{name}=\"{value}\"" for name, value in query_labels.items())
        cmd += f"}}[{time_range}]' 2>/dev/null"

        out = json.loads(kubectl.launch(cmd))
        if out.get("status") != "success":
            fail("wrong response from prometheus query API")

        results = out["data"]["result"]
        if not results:
            with Then("not present, empty result"):
                return False

        # Only the first returned series is inspected.
        result_labels = results[0]["metric"].items()
        exists = all(item in result_labels for item in query_labels.items())
        with Then("got result and contains labels" if exists else "got result, but doesn't contain labels"):
            return exists
def test_metrics_exporter_with_multiple_clickhouse_version():
    """Check exporter ``/metrics`` output for a multi-version CHI.

    Expects no ``chi_clickhouse_metric_VersionInteger`` in a fresh namespace,
    the metric for all four hosts after installing
    ``test-017-multi-version``, and no such metric again once the namespace
    is deleted.
    """

    def check_monitoring_metrics(operator_namespace, operator_pod, expect_result, max_retries=10):
        """Poll ``/metrics`` until each substring's presence matches ``expect_result``.

        ``expect_result`` maps a substring to whether it must be present.
        Makes ``max_retries - 1`` attempts, sleeping ``attempt * 5`` seconds
        between them, then asserts on the last outcome.
        """
        with And(f"metrics-exporter /metrics enpoint result should match with {expect_result}"):
            # Bound up-front so the assert fails cleanly (instead of raising
            # UnboundLocalError) when the loop body never runs.
            all_strings_expected_done = False
            last_attempt = max_retries - 1
            for i in range(1, max_retries):
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                all_strings_expected_done = all(
                    exists == (string in out)
                    for string, exists in expect_result.items()
                )
                if all_strings_expected_done:
                    break
                # No point in sleeping after the final attempt: nothing is
                # re-checked afterwards.
                if i < last_attempt:
                    with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                        time.sleep(i * 5)
            assert all_strings_expected_done, error()

    with Given("clickhouse-operator pod exists"):
        # Second line of `kubectl get pods` output; its first column is used
        # as the pod name.
        out = kubectl.launch("get pods -l app=clickhouse-operator", ns='kube-system').splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = "kube-system"

        with Then("check empty /metrics"):
            kubectl.delete_ns(kubectl.namespace, ok_to_fail=True)
            kubectl.create_ns(kubectl.namespace)
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                'chi_clickhouse_metric_VersionInteger': False,
            })

        with Then("Install multiple clickhouse version"):
            config = util.get_full_path("configs/test-017-multi-version.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 4,
                        "pod": 4,
                        "service": 5,
                    },
                    "do_not_delete": True,
                })

        with And("Check not empty /metrics"):
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                '# HELP chi_clickhouse_metric_VersionInteger': True,
                '# TYPE chi_clickhouse_metric_VersionInteger gauge': True,
                'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-0-0': True,
                'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-1-0': True,
                'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-2-0': True,
                'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-3-0': True,
            })

    with Then("check empty /metrics after delete namespace"):
        kubectl.delete_ns(kubectl.namespace)
        check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
            'chi_clickhouse_metric_VersionInteger': False,
        })
def exec_on_backup_container(backup_pod, cmd, ns=settings.test_namespace, ok_to_fail=False, timeout=60,
                             container='clickhouse-backup'):
    """Run ``cmd`` via ``kubectl exec`` in a container of ``backup_pod``.

    :param backup_pod: pod to exec into.
    :param cmd: command line appended after ``--``.
    :param ns: namespace of the pod.
    :param ok_to_fail: forwarded to ``kubectl.launch``.
    :param timeout: forwarded to ``kubectl.launch``.
    :param container: container name passed with ``-c``.
    :return: whatever ``kubectl.launch`` returns.
    """
    exec_line = f'exec -n {ns} {backup_pod} -c {container} -- {cmd}'
    result = kubectl.launch(exec_line, ok_to_fail=ok_to_fail, timeout=timeout)
    return result
def query(
        chi_name,
        sql,
        with_error=False,
        host="127.0.0.1",
        port="9000",
        user="",
        pwd="",
        ns=settings.test_namespace,
        timeout=60,
        advanced_params="",
        pod="",
):
    """Run ``sql`` with clickhouse-client inside a pod of the given CHI.

    The pod is the first one from ``kubectl.get_pod_names(chi_name, ns)``
    whose name contains ``host`` or equals ``pod``; if none matches, the
    first pod in the list is used.

    :param chi_name: installation whose pods are listed.
    :param sql: query text passed via ``--query``.
    :param with_error: when true, stderr is redirected to stdout and the
        launch is allowed to fail.
    :param host: value for ``-h``; also used for pod selection.
    :param port: value for ``--port``.
    :param user: optional ``--user`` value (omitted when empty).
    :param pwd: optional ``--password`` value (omitted when empty).
    :param ns: namespace of the pods.
    :param timeout: forwarded to ``kubectl.launch``.
    :param advanced_params: extra clickhouse-client arguments.
    :param pod: exact pod name to prefer.
    :return: whatever ``kubectl.launch`` returns.
    """
    pod_names = kubectl.get_pod_names(chi_name, ns)
    pod_name = next(
        (p for p in pod_names if host in p or p == pod),
        pod_names[0],
    )

    pwd_str = f"--password={pwd}" if pwd != "" else ""
    user_str = f"--user={user}" if user != "" else ""

    if with_error:
        return kubectl.launch(
            f"exec {pod_name}"
            f" --"
            f" clickhouse-client -mn -h {host} --port={port} {user_str} {pwd_str} {advanced_params}"
            f" --query=\"{sql}\""
            f" 2>&1",
            timeout=timeout,
            ns=ns,
            ok_to_fail=True,
        )

    # The leading space before --query is required: without it a non-empty
    # advanced_params was glued to "--query" and produced an invalid
    # argument (e.g. "--foo=1--query=...").
    return kubectl.launch(
        f"exec {pod_name} -n {ns}"
        f" -- "
        f"clickhouse-client -mn -h {host} --port={port} {user_str} {pwd_str} {advanced_params}"
        f" --query=\"{sql}\"",
        timeout=timeout,
        ns=ns,
    )
def check_monitoring_chi(operator_namespace, operator_pod, expect_result, max_retries=10):
    """Poll the metrics-exporter ``/chi`` endpoint until it equals ``expect_result``.

    Each attempt first fetches ``/metrics`` and then ``/chi`` from inside the
    ``metrics-exporter`` container; the ``/chi`` output is parsed as JSON.
    Makes ``max_retries - 1`` attempts, sleeping ``attempt * 5`` seconds
    between them, then asserts on the last parsed result.

    :param operator_namespace: namespace of the operator pod.
    :param operator_pod: operator pod name.
    :param expect_result: value the parsed ``/chi`` response must equal.
    :param max_retries: upper bound (exclusive) of the attempt counter.
    """
    with Then(f"metrics-exporter /chi enpoint result should return {expect_result}"):
        # Bound up-front so the assert below fails cleanly (instead of raising
        # UnboundLocalError) when the loop body never runs.
        out = None
        last_attempt = max_retries - 1
        for i in range(1, max_retries):
            # check /metrics for try to refresh monitored instances
            kubectl.launch(
                f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                ns=operator_namespace
            )
            # check /chi after refresh monitored instances
            out = kubectl.launch(
                f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/chi",
                ns=operator_namespace
            )
            out = json.loads(out)
            if out == expect_result:
                break
            # No point in sleeping after the final attempt: nothing is
            # re-checked afterwards.
            if i < last_attempt:
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
        assert out == expect_result, error()
def set_operator_version(version, ns=settings.operator_namespace, timeout=60):
    """Switch both operator deployment containers to the ``version`` image tag.

    Sets the ``clickhouse-operator`` and ``metrics-exporter`` images, waits
    for the rollout, and asserts that at least one operator pod exists.

    :param version: image tag to apply.
    :param ns: namespace of the operator deployment.
    :param timeout: timeout passed to the ``rollout status`` launch.
    """
    deployment = "deployment.v1.apps/clickhouse-operator"
    # Container name -> image, applied in this order.
    container_images = (
        ("clickhouse-operator", f"{settings.operator_docker_repo}:{version}"),
        ("metrics-exporter", f"{settings.metrics_exporter_docker_repo}:{version}"),
    )
    for container, image in container_images:
        kubectl.launch(f"set image {deployment} {container}={image}", ns=ns)

    kubectl.launch(f"rollout status {deployment}", ns=ns, timeout=timeout)
    operator_pods = kubectl.get_count("pod", ns=ns, label="-l app=clickhouse-operator")
    assert operator_pods > 0, error()
def make_too_many_queries():
    """Start 90 long-running queries in parallel inside ``too_many_queries_pod``.

    A shell script is written to ``/tmp/long_cmd.sh`` locally, copied into the
    ``clickhouse`` container and executed there with a 70 second timeout.
    Each query uses a randomly chosen protocol.
    """
    # Port -> shell command (terminated by ';') running a slow query.
    command_by_port = {
        "9000": 'clickhouse-client -q "SELECT sleepEachRow(1),now() FROM numbers(60)";',
        "3306": 'mysql -h 127.0.0.1 -P 3306 -u default -e "SELECT sleepEachRow(1),now() FROM numbers(60)";',
        "8123": 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),now() FROM numbers(60)";',
    }
    queries = "".join(
        command_by_port[random.choice(["8123", "3306", "9000"])]
        for _ in range(90)
    )

    # xargs splits on ';' and runs up to 100 commands concurrently.
    long_cmd = f"echo '{queries}' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 100 bash -c '{{}}' 1>/dev/null"
    with open("/tmp/long_cmd.sh", "w") as script:
        script.write(long_cmd)

    copy_cmd = f"cp /tmp/long_cmd.sh {too_many_queries_pod}:/tmp/long_cmd.sh -c clickhouse"
    kubectl.launch(copy_cmd)

    run_cmd = f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse -- bash /tmp/long_cmd.sh"
    kubectl.launch(run_cmd, timeout=70)
def set_operator_version(version, ns=settings.operator_namespace, timeout=60):
    """Switch both operator deployment containers to the ``version`` image tag.

    Sets the ``clickhouse-operator`` and ``metrics-exporter`` images, waits
    for the rollout, and calls ``fail`` when no pod matches
    ``operator_label``.

    :param version: image tag to apply.
    :param ns: namespace of the operator deployment.
    :param timeout: timeout passed to the ``rollout status`` launch.
    """
    deployment = "deployment.v1.apps/clickhouse-operator"
    # Container name -> image, applied in this order.
    container_images = (
        ("clickhouse-operator", f"{settings.operator_docker_repo}:{version}"),
        ("metrics-exporter", f"{settings.metrics_exporter_docker_repo}:{version}"),
    )
    for container, image in container_images:
        kubectl.launch(f"set image {deployment} {container}={image}", ns=ns)

    kubectl.launch(f"rollout status {deployment}", ns=ns, timeout=timeout)
    operator_pods = kubectl.get_count("pod", ns=ns, label=operator_label)
    if operator_pods == 0:
        fail("invalid clickhouse-operator pod count")
def check_monitoring_metrics(operator_namespace, operator_pod, expect_result, max_retries=10):
    """Poll exporter ``/metrics`` until each substring's presence matches ``expect_result``.

    Makes ``max_retries - 1`` attempts, sleeping ``attempt * 5`` seconds
    between them, then asserts on the last outcome.

    :param operator_namespace: namespace of the operator pod.
    :param operator_pod: operator pod name.
    :param expect_result: dict mapping a substring to a bool telling whether
        it must (True) or must not (False) occur in the output.
    :param max_retries: upper bound (exclusive) of the attempt counter.
    """
    with Then(f"metrics-exporter /metrics enpoint result should match with {expect_result}"):
        # Bound up-front so the assert fails cleanly (instead of raising
        # UnboundLocalError) when the loop body never runs.
        all_strings_expected_done = False
        last_attempt = max_retries - 1
        for i in range(1, max_retries):
            out = kubectl.launch(
                f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                ns=operator_namespace
            )
            all_strings_expected_done = all(
                exists == (string in out)
                for string, exists in expect_result.items()
            )
            if all_strings_expected_done:
                break
            # No point in sleeping after the final attempt: nothing is
            # re-checked afterwards.
            if i < last_attempt:
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
        assert all_strings_expected_done, error()
def test_ch_001(self):
    """Insert-quorum scenario on a two-replica CHI.

    Creates replicated tables t1, t2, t3 with materialized views t1->t2 and
    t1->t3, then checks TTL cleanup, inserts while fetches are stopped and
    resumed on the second replica, and the quorum-related error messages
    returned for subsequent inserts.
    """
    util.require_zookeeper()

    chit_data = manifest.get_chit_data(
        util.get_full_path("templates/tpl-clickhouse-19.11.yaml"))
    chit_name = chit_data['metadata']['name']
    kubectl.launch(f"delete chit {chit_name}", ns=settings.test_namespace)

    quorum_config = "configs/test-ch-001-insert-quorum.yaml"
    kubectl.create_and_check(
        quorum_config,
        {
            "apply_templates": {"templates/tpl-clickhouse-20.8.yaml"},
            "pod_count": 2,
            "do_not_delete": 1,
        })

    chi = manifest.get_chi_name(util.get_full_path(quorum_config))
    chi_data = kubectl.get("chi", ns=settings.test_namespace, name=chi)
    util.wait_clickhouse_cluster_ready(chi_data)

    host0 = "chi-test-ch-001-insert-quorum-default-0-0"
    host1 = "chi-test-ch-001-insert-quorum-default-0-1"

    # DDL is assembled from adjacent literals; CR/LF characters are stripped
    # afterwards.
    create_table = (
        " create table t1 on cluster default (a Int8, d Date default today())"
        " Engine = ReplicatedMergeTree('/clickhouse/tables/{table}', '{replica}')"
        " partition by d order by a"
        " TTL d + interval 5 second"
        " SETTINGS merge_with_ttl_timeout=5"
    ).replace('\r', '').replace('\n', '')

    create_mv_table2 = (
        " create table t2 on cluster default (a Int8)"
        " Engine = ReplicatedMergeTree('/clickhouse/tables/{table}', '{replica}')"
        " partition by tuple() order by a"
    ).replace('\r', '').replace('\n', '')

    create_mv_table3 = (
        " create table t3 on cluster default (a Int8)"
        " Engine = ReplicatedMergeTree('/clickhouse/tables/{table}', '{replica}')"
        " partition by tuple() order by a"
    ).replace('\r', '').replace('\n', '')

    create_mv2 = "create materialized view t_mv2 on cluster default to t2 as select a from t1"
    create_mv3 = "create materialized view t_mv3 on cluster default to t3 as select a from t1"

    quorum_not_satisfied = "Quorum for previous write has not been satisfied yet"

    with Given("Tables t1, t2, t3 and MVs t1->t2, t1-t3 are created"):
        for ddl in (create_table, create_mv_table2, create_mv_table3, create_mv2, create_mv3):
            clickhouse.query(chi, ddl)

        with When("Add a row to an old partition"):
            clickhouse.query(chi, "insert into t1(a,d) values(6, today()-1)", host=host0)

        with When("Stop fetches for t1 at replica1"):
            clickhouse.query(chi, "system stop fetches default.t1", host=host1)

            with Then("Wait 10 seconds and the data should be dropped by TTL"):
                time.sleep(10)
                out = clickhouse.query(chi, "select count() from t1 where a=6", host=host0)
                assert out == "0"

        with When("Resume fetches for t1 at replica1"):
            clickhouse.query(chi, "system start fetches default.t1", host=host1)
            time.sleep(5)

            with Then("Inserts should resume"):
                clickhouse.query(chi, "insert into t1(a) values(7)", host=host0)

        clickhouse.query(chi, "insert into t1(a) values(1)")

        with When("Stop fetches for t2 at replica1"):
            clickhouse.query(chi, "system stop fetches default.t2", host=host1)

            with Then("Insert should fail since it can not reach the quorum"):
                out = clickhouse.query_with_error(
                    chi, "insert into t1(a) values(2)", host=host0)
                assert "Timeout while waiting for quorum" in out

        with When("Resume fetches for t2 at replica1"):
            clickhouse.query(chi, "system start fetches default.t2", host=host1)

            # Poll until both replicas of t1 are reported active; the counter
            # caps the number of 5-second waits at 10.
            active_replicas_sql = "select active_replicas from system.replicas where database='default' and table='t1'"
            waits = 0
            while "2" != clickhouse.query(chi, active_replicas_sql, pod=host0) and waits < 10:
                with Then("Not ready, wait 5 seconds"):
                    time.sleep(5)
                    waits += 1

            with Then(
                    "Inserts should fail with an error regarding not satisfied quorum"
            ):
                out = clickhouse.query_with_error(
                    chi, "insert into t1(a) values(3)", host=host0)
                assert quorum_not_satisfied in out

            with And("Second insert of the same block should pass"):
                clickhouse.query(chi, "insert into t1(a) values(3)", host=host0)

            with And("Insert of the new block should fail"):
                out = clickhouse.query_with_error(
                    chi, "insert into t1(a) values(4)", host=host0)
                assert quorum_not_satisfied in out

            with And(
                    "Second insert of the same block with 'deduplicate_blocks_in_dependent_materialized_views' setting should fail"
            ):
                out = clickhouse.query_with_error(
                    chi,
                    "set deduplicate_blocks_in_dependent_materialized_views=1; insert into t1(a) values(5)",
                    host=host0)
                assert quorum_not_satisfied in out

        # Final state of t1 joined with t2, printed for inspection.
        out = clickhouse.query_with_error(
            chi,
            "select t1.a t1_a, t2.a t2_a from t1 left outer join t2 using (a) order by t1_a settings join_use_nulls=1"
        )
        print(out)
def restart_zookeeper():
    """Run ``kill 1`` inside ``zookeeper_pod``; a failing exec is tolerated."""
    kill_cmd = f"exec -n {kubectl.namespace} {zookeeper_pod} -- sh -c \"kill 1\""
    kubectl.launch(kill_cmd, ok_to_fail=True)
def set_metrics_exporter_version(version, ns=settings.operator_namespace):
    """Set the metrics-exporter image tag on the operator deployment.

    Uses image ``altinity/metrics-exporter:<version>`` and then waits for the
    deployment rollout.

    :param version: image tag to apply.
    :param ns: namespace of the operator deployment.
    """
    deployment = "deployment.v1.apps/clickhouse-operator"
    exporter_image = f"altinity/metrics-exporter:{version}"
    kubectl.launch(f"set image {deployment} metrics-exporter={exporter_image}", ns=ns)
    kubectl.launch(f"rollout status {deployment}", ns=ns)
def reboot_clickhouse_server():
    """Run ``kill 1`` in the clickhouse container of ``clickhouse_pod``.

    The exec is launched with ``ok_to_fail=True``.
    """
    kill_cmd = f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- kill 1"
    kubectl.launch(kill_cmd, ok_to_fail=True)