def test_backup_not_run(self, chi, minio_spec):
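    """Check ClickhouseBackupDoesntRunTooLong: replace clickhouse-backup with a fake
    nginx-based metrics endpoint, publish a backup-finish timestamp two days in the past
    so the alert should fire, then restore the real sidecar, run a real create/upload
    backup and expect the alert to resolve."""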
    not_run_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup for time metric")

    with Then(f"wait {not_run_pod} ready"):
        kubectl.wait_field("pod", not_run_pod, ".spec.containers[1].image",
                           "nginx:latest")
        kubectl.wait_field("pod", not_run_pod,
                           ".status.containerStatuses[1].ready", "true")

    with Then(f"setup {not_run_pod} backup create end time"):
        kubectl.launch(
            f'exec {not_run_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_finish Last backup create finish timestamp" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_finish gauge" >> /usr/share/nginx/html/metrics && '
            f'echo "clickhouse_backup_last_create_finish {int((datetime.datetime.now() - datetime.timedelta(days=2)).timestamp())}" >> /usr/share/nginx/html/metrics '
            '\'')

        fired = alerts.wait_alert_state(
            "ClickhouseBackupDoesntRunTooLong",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": not_run_pod},
            time_range='60s')
        assert fired, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert in firing state")

    apply_normal_backup()

    backup_name = prepare_table_for_backup(not_run_pod, chi)
    wait_backup_pod_ready_and_curl_installed(not_run_pod)

    with When('Backup is successful'):
        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'create {backup_name}',
                                   expected_status='success')

        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/upload/{backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'upload {backup_name}',
                                   expected_status='success')

    with Then("check ClickhouseBackupDoesntRunTooLong gone away"):
        resolved = alerts.wait_alert_state("ClickhouseBackupDoesntRunTooLong",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": not_run_pod})
        assert resolved, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert is gone away")
def test_longest_running_query(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
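    """Check ClickHouseLongestRunningQuery: run a ~660s sleepEachRow() query (the 600s
    alert threshold plus two scrape intervals) and expect the alert to fire and then
    resolve."""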
    long_running_pod, long_running_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    # 600s alert threshold + 2 * 30s (double the prometheus scrape interval)
    clickhouse.query(chi["metadata"]["name"], "SELECT now(),sleepEachRow(1),number FROM system.numbers LIMIT 660",
                     host=long_running_svc, timeout=670)
    with Then("check ClickHouseLongestRunningQuery firing"):
        fired = alerts.wait_alert_state("ClickHouseLongestRunningQuery", "firing", True, labels={"hostname": long_running_svc},
                                 time_range='30s')
        assert fired, error("can't get ClickHouseLongestRunningQuery alert in firing state")
    with Then("check ClickHouseLongestRunningQuery gone away"):
        resolved = alerts.wait_alert_state("ClickHouseLongestRunningQuery", "firing", False, labels={"hostname": long_running_svc})
        assert resolved, error("can't check ClickHouseLongestRunningQuery alert is gone away")
def test_backup_size(self, chi, minio_spec):
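    """Check ClickHouseBackupSizeChanged: create consecutive backups whose row counts
    shrink on one pod and grow on another, expecting the alert to fire for both the
    decrease and the increase and then resolve."""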
    decrease_pod, _, increase_pod, _ = alerts.random_pod_choice_for_callbacks(
        chi)

    backup_cases = {
        decrease_pod: {
            'rows': (10000, 1000),
            'decrease': True,
        },
        increase_pod: {
            'rows': (1000, 10000),
            'decrease': False,
        },
    }
    for backup_pod in backup_cases:
        decrease = backup_cases[backup_pod]['decrease']
        for backup_rows in backup_cases[backup_pod]['rows']:
            backup_name = prepare_table_for_backup(backup_pod,
                                                   chi,
                                                   rows=backup_rows)
            exec_on_backup_container(
                backup_pod,
                f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"'
            )
            wait_backup_command_status(backup_pod,
                                       f'create {backup_name}',
                                       expected_status='success')
            if decrease:
                clickhouse.query(chi['metadata']['name'],
                                 f"TRUNCATE TABLE default.test_backup",
                                 pod=backup_pod)
            time.sleep(15)
        fired = alerts.wait_alert_state(
            "ClickHouseBackupSizeChanged",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": backup_pod},
            time_range='60s')
        assert fired, error(
            f"can't get ClickHouseBackupSizeChanged alert in firing state, decrease={decrease}"
        )

        with Then("check ClickHouseBackupSizeChanged gone away"):
            resolved = alerts.wait_alert_state("ClickHouseBackupSizeChanged",
                                               "firing",
                                               expected_state=False,
                                               labels={"pod_name": backup_pod})
            assert resolved, error(
                f"can't get ClickHouseBackupSizeChanged alert is gone away, decrease={decrease}"
            )
def test_version_changed(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
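    """Check ClickHouseVersionChanged: apply a CHI manifest with a different ClickHouse
    template, expect the alert to fire after a couple of scrape intervals, then roll the
    settings back and expect it to resolve."""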
    changed_pod, changed_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)

    with When("apply changed settings"):
        kubectl.create_and_check(
            manifest="manifests/chi/test-cluster-for-alerts-changed-settings.yaml",
            check={
                "apply_templates": [
                    "manifests/chit/tpl-clickhouse-stable.yaml",
                    "manifests/chit/tpl-persistent-volume-100Mi.yaml"
                ],
                "object_counts": {
                    "statefulset": 2,
                    "pod": 2,
                    "service": 3,
                },
                "do_not_delete": 1
            }
        )
        prometheus_scrape_interval = 15
        with Then(f"wait prometheus_scrape_interval={prometheus_scrape_interval}*2 sec"):
            time.sleep(prometheus_scrape_interval * 2)

    with Then("check ClickHouseVersionChanged firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseVersionChanged", "firing", True, labels={"hostname": changed_svc}, time_range="30s", sleep_time=settings.prometheus_scrape_interval
        )
        assert fired, error("can't get ClickHouseVersionChanged alert in firing state")

    with When("rollback changed settings"):
        kubectl.create_and_check(
            manifest="manifests/chi/test-cluster-for-alerts.yaml",
            check={
                "apply_templates": [
                    "manifests/chit/tpl-clickhouse-latest.yaml",
                    "manifests/chit/tpl-clickhouse-alerts.yaml",
                    "manifests/chit/tpl-persistent-volume-100Mi.yaml"
                ],
                "object_counts": {
                    "statefulset": 2,
                    "pod": 2,
                    "service": 3,
                },
                "do_not_delete": 1
            }
        )

    with Then("check ClickHouseVersionChanged gone away"):
        resolved = alerts.wait_alert_state("ClickHouseVersionChanged", "firing", False, labels={"hostname": changed_svc}, sleep_time=30)
        assert resolved, error("can't check ClickHouseVersionChanged alert is gone away")
def test_read_only_replica(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
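    """Check ClickHouseReadonlyReplica: create a replicated table, repeatedly kill the
    zookeeper container while inserting so a replica turns read-only, expect the alert to
    fire and resolve, then wait for zookeeper and restart/sync the replicas."""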
    read_only_pod, read_only_svc, other_pod, other_svc = alerts.random_pod_choice_for_callbacks(chi)
    chi_name = chi["metadata"]["name"]
    clickhouse.create_table_on_cluster(
        chi,
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
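        # Kill PID 1 inside the zookeeper container and immediately try an insert; while
        # zookeeper is restarting, the replica should be forced into read-only mode.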
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = alerts.wait_alert_state("ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
                                        time_range='30s', sleep_time=settings.prometheus_scrape_interval, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = alerts.wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

    for i in range(11):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"", ok_to_fail=True
        )
        if "imok" in zookeeper_status:
            break
        elif i == 10:
            fail(f"invalid zookeeper status after {i} retries")
        with Then("zookeper is not ready, wait 2 seconds"):
            time.sleep(2)

    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240
    )
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240
    )

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')
def test_distributed_files_to_insert(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
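    """Check ClickHouseDistributedFilesToInsertHigh: stop distributed sends on one pod,
    keep killing the other pod and inserting into the distributed table until more than
    55 pending .bin files pile up, expect the alert to fire, then start sends again and
    expect it to resolve."""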
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = alerts.random_pod_choice_for_callbacks(chi)
    clickhouse.create_distributed_table_on_cluster(chi)

    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1000'
    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM STOP DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    files_to_insert_from_metrics = 0
    files_to_insert_from_disk = 0
    tries = 0
    # we need more than 50 delayed files to trigger the alert
    while files_to_insert_from_disk <= 55 and files_to_insert_from_metrics <= 55 and tries < 500:
        kubectl.launch(
            f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse-pod -- kill 1",
            ok_to_fail=True,
        )
        clickhouse.query(chi["metadata"]["name"], insert_sql, pod=delayed_pod, host=delayed_pod, ns=kubectl.namespace)
        files_to_insert_from_metrics = clickhouse.query(
            chi["metadata"]["name"], "SELECT value FROM system.metrics WHERE metric='DistributedFilesToInsert'",
            pod=delayed_pod, ns=kubectl.namespace
        )
        files_to_insert_from_metrics = int(files_to_insert_from_metrics)

        files_to_insert_from_disk = int(kubectl.launch(
            f"exec -n {kubectl.namespace} {delayed_pod} -c clickhouse-pod -- bash -c 'ls -la /var/lib/clickhouse/data/default/test_distr/*/*.bin 2>/dev/null | wc -l'",
            ok_to_fail=False,
        ))
        tries += 1

    with When("reboot clickhouse-server pod"):
        fired = alerts.wait_alert_state(
            "ClickHouseDistributedFilesToInsertHigh", "firing", True,
            labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]}
        )
        assert fired, error("can't get ClickHouseDistributedFilesToInsertHigh alert in firing state")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)

    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM START DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    with Then("check ClickHouseDistributedFilesToInsertHigh gone away"):
        resolved = alerts.wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedFilesToInsertHigh alert is gone away")

    clickhouse.drop_distributed_table_on_cluster(chi)
def test_backup_is_down(self, chi, minio_spec):
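    """Check ClickHouseBackupDown and ClickHouseBackupRecentlyRestart: kill the
    clickhouse-backup container and expect each alert to fire in turn and then resolve."""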
    reboot_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)

    def reboot_backup_container():
        kubectl.launch(
            f"exec -n {settings.test_namespace} {reboot_pod} -c clickhouse-backup -- kill 1",
            ok_to_fail=True,
        )

    with When("reboot clickhouse-backup"):
        fired = alerts.wait_alert_state("ClickHouseBackupDown",
                                        "firing",
                                        expected_state=True,
                                        callback=reboot_backup_container,
                                        labels={"pod_name": reboot_pod},
                                        time_range='60s')
        assert fired, error(
            "can't get ClickHouseBackupDown alert in firing state")

    with Then("check ClickHouseBackupDown gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseBackupDown",
            "firing",
            expected_state=False,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": reboot_pod},
        )
        assert resolved, error(
            "can't get ClickHouseBackupDown alert is gone away")

    with When("reboot clickhouse-backup again"):
        fired = alerts.wait_alert_state("ClickHouseBackupRecentlyRestart",
                                        "firing",
                                        expected_state=True,
                                        callback=reboot_backup_container,
                                        labels={"pod_name": reboot_pod},
                                        time_range='60s')
        assert fired, error(
            "can't get ClickHouseBackupRecentlyRestart alert in firing state")

    with Then("check ClickHouseBackupRecentlyRestart gone away"):
        resolved = alerts.wait_alert_state("ClickHouseBackupRecentlyRestart",
                                           "firing",
                                           expected_state=False,
                                           time_range='30s',
                                           labels={"pod_name": reboot_pod},
                                           sleep_time=30)
        assert resolved, error(
            "can't get ClickHouseBackupRecentlyRestart alert is gone away")
def test_backup_failed(self, chi, minio_spec):
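    """Check ClickHouseBackupFailed: pre-create a read-only shadow directory so a backup
    create fails and the alert fires, then run a successful backup and expect it to
    resolve."""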
    backup_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    backup_prefix = prepare_table_for_backup(backup_pod, chi)

    wait_backup_pod_ready_and_curl_installed(backup_pod)

    def create_fail_backup():
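        # Pre-create the backup's shadow directory with mode 0400 so clickhouse-backup
        # cannot write into it and the create command ends with status 'error'.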
        backup_name = backup_prefix + "-" + str(random.randint(1, 4096))
        backup_dir = f"/var/lib/clickhouse/backup/{backup_name}/shadow/default/test_backup"
        kubectl.launch(
            f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- bash -c 'mkdir -v -m 0400 -p {backup_dir}'",
        )

        kubectl.launch(
            f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- curl -X POST -sL http://127.0.0.1:7171/backup/create?name={backup_name}",
        )
        wait_backup_command_status(backup_pod,
                                   command_name=f'create {backup_name}',
                                   expected_status='error')

    def create_success_backup():
        backup_name = backup_prefix + "-" + str(random.randint(1, 4096))
        kubectl.launch(
            f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- curl -X POST -sL http://127.0.0.1:7171/backup/create?name={backup_name}",
        )
        wait_backup_command_status(backup_pod,
                                   command_name=f'create {backup_name}',
                                   expected_status='success')

    with When("clickhouse-backup create failed"):
        fired = alerts.wait_alert_state("ClickHouseBackupFailed",
                                        "firing",
                                        expected_state=True,
                                        callback=create_fail_backup,
                                        labels={"pod_name": backup_pod},
                                        time_range='60s')
        assert fired, error(
            "can't get ClickHouseBackupFailed alert in firing state")

    with Then("check ClickHouseBackupFailed gone away"):
        resolved = alerts.wait_alert_state("ClickHouseBackupFailed",
                                           "firing",
                                           expected_state=False,
                                           callback=create_success_backup,
                                           labels={"pod_name": backup_pod},
                                           sleep_time=30)
        assert resolved, error(
            "can't get ClickHouseBackupFailed alert is gone away")
def test_too_many_connections(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
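    """Check ClickHouseTooManyConnections: open many parallel HTTP, native and raw TCP
    connections against the pod and expect the alert to fire and then resolve."""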
    too_many_connection_pod, too_many_connection_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    cmd = "export DEBIAN_FRONTEND=noninteractive; apt-get update; apt-get install -y netcat mysql-client"
    kubectl.launch(
        f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse-pod -- bash -c  \"{cmd}\"",
        timeout=120,
    )

    def make_too_many_connection():
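        # Build a long ';'-separated command string and run it with xargs -P 120 so many
        # HTTP, native and raw TCP connections stay open at the same time.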
        long_cmd = ""
        for _ in range(120):
            port = random.choice(["8123", "3306", "3306", "3306", "9000"])
            if port == "8123":
                # the HTTPConnection metric increases only after the HTTP request is fully parsed,
                # so we can't pause between CONNECT and running the query
                # long_cmd += f"nc -vv 127.0.0.1 {port} <( printf \"POST / HTTP/1.1\\r\\nHost: 127.0.0.1:8123\\r\\nContent-Length: 34\\r\\n\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\");"
                long_cmd += 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            elif port == "9000":
                long_cmd += 'clickhouse-client --send_logs_level information --idle_connection_timeout 70 --receive_timeout 70 -q "SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            # elif port == "3306":
            #     long_cmd += 'mysql -u default -h 127.0.0.1 -e "SELECT sleepEachRow(1),number, now() FROM numbers(30)";'
            else:
                long_cmd += f"printf \"1\\n1\" | nc -q 5 -i 30 -vv 127.0.0.1 {port};"

        nc_cmd = f"echo '{long_cmd} whereis nc; exit 0' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 120 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/nc_cmd.sh", "w") as f:
            f.write(nc_cmd)

        kubectl.launch(
            f"cp /tmp/nc_cmd.sh {too_many_connection_pod}:/tmp/nc_cmd.sh -c clickhouse-pod"
        )

        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse-pod -- bash /tmp/nc_cmd.sh",
            timeout=600,
        )

    with Then("check ClickHouseTooManyConnections firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseTooManyConnections", "firing", True, labels={"hostname": too_many_connection_svc},
            time_range='90s', callback=make_too_many_connection
        )
        assert fired, error("can't get ClickHouseTooManyConnections alert in firing state")

    with Then("check ClickHouseTooManyConnections gone away"):
        resolved = alerts.wait_alert_state("ClickHouseTooManyConnections", "firing", False, labels={"hostname": too_many_connection_svc})
        assert resolved, error("can't check ClickHouseTooManyConnections alert is gone away")
def test_too_much_running_queries(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
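    """Check ClickHouseTooManyRunningQueries: launch many parallel long-running queries
    over HTTP, native and MySQL protocols and expect the alert to fire and then resolve."""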
    _, _, too_many_queries_pod, too_many_queries_svc = alerts.random_pod_choice_for_callbacks(chi)
    cmd = "export DEBIAN_FRONTEND=noninteractive; apt-get update; apt-get install -y mysql-client"
    kubectl.launch(
        f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse-pod -- bash -c  \"{cmd}\"",
        ok_to_fail=True,
    )

    def make_too_many_queries():
        long_cmd = ""
        for _ in range(90):
            port = random.choice(["8123", "3306", "9000"])
            if port == "9000":
                long_cmd += 'clickhouse-client -q "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "3306":
                long_cmd += 'mysql -h 127.0.0.1 -P 3306 -u default -e "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "8123":
                long_cmd += 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),now() FROM numbers(60)";'

        long_cmd = f"echo '{long_cmd}' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 100 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/long_cmd.sh", "w") as f:
            f.write(long_cmd)

        kubectl.launch(
            f"cp /tmp/long_cmd.sh {too_many_queries_pod}:/tmp/long_cmd.sh -c clickhouse-pod"
        )
        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse-pod -- bash /tmp/long_cmd.sh",
            timeout=90,
        )

    with Then("check ClickHouseTooManyRunningQueries firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseTooManyRunningQueries", "firing", True, labels={"hostname": too_many_queries_svc},
            callback=make_too_many_queries, time_range="30s"
        )
        assert fired, error("can't get ClickHouseTooManyConnections alert in firing state")

    with Then("check ClickHouseTooManyConnections gone away"):
        resolved = alerts.wait_alert_state("ClickHouseTooManyRunningQueries", "firing", False, labels={"hostname": too_many_queries_svc},
                                           sleep_time=settings.prometheus_scrape_interval)
        assert resolved, error("can't check ClickHouseTooManyConnections alert is gone away")
def test_replicas_max_absolute_delay(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
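    """Check ClickHouseReplicasMaxAbsoluteDelay: stop fetches on one replica while
    inserting on the other so the replication delay grows, expect the alert to fire,
    then restart fetches/replicas and expect it to resolve."""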
    stop_replica_pod, stop_replica_svc, insert_pod, insert_svc = alerts.random_pod_choice_for_callbacks(chi)
    clickhouse.create_table_on_cluster(
        chi,
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )
    prometheus_scrape_interval = 15

    def restart_clickhouse_and_insert_to_replicated_table():
        with When(f"stop replica fetches on {stop_replica_svc}"):
            sql = "SYSTEM STOP FETCHES default.test_repl"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse-pod -- clickhouse-client -q \"{sql}\"",
                ok_to_fail=True, timeout=600,
            )
            sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse-pod -- clickhouse-client -q \"{sql}\"",
            )

    with Then("check ClickHouseReplicasMaxAbsoluteDelay firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReplicasMaxAbsoluteDelay", "firing", True, labels={"hostname": stop_replica_svc},
            time_range='60s', sleep_time=prometheus_scrape_interval * 2,
            callback=restart_clickhouse_and_insert_to_replicated_table
        )
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")

    clickhouse.query(
        chi["metadata"]["name"],
        "SYSTEM START FETCHES; SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=stop_replica_svc, timeout=240
    )
    with Then("check ClickHouseReplicasMaxAbsoluteDelay gone away"):
        resolved = alerts.wait_alert_state("ClickHouseReplicasMaxAbsoluteDelay", "firing", False, labels={"hostname": stop_replica_svc})
        assert resolved, error("can't check ClickHouseReplicasMaxAbsoluteDelay alert is gone away")

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')
def test_backup_is_success(self, chi, minio_spec):
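    """Verify a successful backup: create and upload a backup via the clickhouse-backup
    REST API, then check that the backup list changed and the successful-backup counter
    increased."""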
    _, _, backup_pod, _ = alerts.random_pod_choice_for_callbacks(chi)
    backup_name = prepare_table_for_backup(backup_pod, chi)
    wait_backup_pod_ready_and_curl_installed(backup_pod)

    with When('Backup is successful'):
        backup_successful_before = get_backup_metric_value(
            backup_pod,
            'clickhouse_backup_successful_backups|clickhouse_backup_successful_creates'
        )
        list_before = exec_on_backup_container(
            backup_pod, 'curl -sL http://127.0.0.1:7171/backup/list')
        exec_on_backup_container(
            backup_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"'
        )
        wait_backup_command_status(backup_pod,
                                   f'create {backup_name}',
                                   expected_status='success')

        exec_on_backup_container(
            backup_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/upload/{backup_name}"'
        )
        wait_backup_command_status(backup_pod,
                                   f'upload {backup_name}',
                                   expected_status='success')

    with Then('list of backups shall change'):
        list_after = exec_on_backup_container(
            backup_pod, 'curl -sL http://127.0.0.1:7171/backup/list')
        assert list_before != list_after, error("backup is not created")

    with Then('successful backup count shall increase'):
        backup_successful_after = get_backup_metric_value(
            backup_pod,
            'clickhouse_backup_successful_backups|clickhouse_backup_successful_creates'
        )
        assert backup_successful_before != backup_successful_after, error(
            "clickhouse_backup_successful_backups shall increased")
def test_query_preempted(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
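    """Check ClickHouseQueryPreempted: run many queries with mixed SET priority values in
    parallel so lower-priority queries get preempted, expect the alert to fire and then
    resolve."""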
    priority_pod, priority_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)

    def run_queries_with_priority():
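        # Queue 50 queries with different SET priority values, split the string on ':' and
        # run up to 20 clickhouse-client processes in parallel so low-priority queries get
        # preempted.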
        sql = ""
        for i in range(50):
            sql += f"SET priority={i % 20};SELECT uniq(number) FROM numbers(20000000):"
        cmd = f"echo \\\"{sql} SELECT 1\\\" | xargs -i'{{}}' --no-run-if-empty -d ':' -P 20 clickhouse-client --time -m -n -q \\\"{{}}\\\""
        kubectl.launch(f"exec {priority_pod} -- bash -c \"{cmd}\"", timeout=120)
        clickhouse.query(
            chi["metadata"]["name"],
            "SELECT event_time, CurrentMetric_QueryPreempted FROM system.metric_log WHERE CurrentMetric_QueryPreempted > 0",
            host=priority_svc,
        )

    with Then("check ClickHouseQueryPreempted firing"):
        fired = alerts.wait_alert_state("ClickHouseQueryPreempted", "firing", True, labels={"hostname": priority_svc},
                                        time_range='30s', sleep_time=settings.prometheus_scrape_interval, callback=run_queries_with_priority)
        assert fired, error("can't get ClickHouseQueryPreempted alert in firing state")
    with Then("check ClickHouseQueryPreempted gone away"):
        resolved = alerts.wait_alert_state("ClickHouseQueryPreempted", "firing", False, labels={"hostname": priority_svc})
        assert resolved, error("can't check ClickHouseQueryPreempted alert is gone away")
def test_distributed_connection_exceptions(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
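    """Check ClickHouseDistributedConnectionExceptions: kill a remote clickhouse pod and
    query a distributed table so connection errors accumulate, expect the alert to fire
    and then resolve."""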
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = alerts.random_pod_choice_for_callbacks(chi)
    clickhouse.create_distributed_table_on_cluster(chi)

    def reboot_clickhouse_and_distributed_exceptions():
        # we need ~70 delayed files to trigger the alert
        insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
        select_sql = 'SELECT count() FROM default.test_distr'
        with Then("reboot clickhouse-server pod"):
            kubectl.launch(
                f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse-pod -- kill 1",
                ok_to_fail=True,
            )
            with Then("Insert to distributed table"):
                clickhouse.query(chi["metadata"]["name"], insert_sql, host=delayed_pod, ns=kubectl.namespace)

            with Then("Select from distributed table"):
                clickhouse.query_with_error(chi["metadata"]["name"], select_sql, host=delayed_pod,
                                            ns=kubectl.namespace)

    with When("check ClickHouseDistributedConnectionExceptions firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseDistributedConnectionExceptions", "firing", True,
            labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]}, time_range='30s',
            callback=reboot_clickhouse_and_distributed_exceptions,
        )
        assert fired, error("can't get ClickHouseDistributedConnectionExceptions alert in firing state")

    with Then("check DistributedConnectionExpections gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseDistributedConnectionExceptions", "firing", False,
            labels={"hostname": delayed_svc}
        )
        assert resolved, error("can't check ClickHouseDistributedConnectionExceptions alert is gone away")
    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", restarted_pod, "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)
    clickhouse.drop_distributed_table_on_cluster(chi)
def test_insert_related_alerts(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
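    """Check insert-related alerts (ClickHouseDelayedInsertThrottling,
    ClickHouseMaxPartCountForPartition, ClickHouseLowInsertedRowsPerQuery,
    ClickHouseRejectedInsert): stop merges and insert single-row blocks to exceed
    parts_to_delay_insert and parts_to_throw_insert, then expect each alert to fire and
    resolve."""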
    clickhouse.create_table_on_cluster(chi)
    delayed_pod, delayed_svc, rejected_pod, rejected_svc = alerts.random_pod_choice_for_callbacks(chi)

    prometheus_scrape_interval = settings.prometheus_scrape_interval
    # default values in system.merge_tree_settings
    parts_to_throw_insert = 300
    parts_to_delay_insert = 150
    chi_name = chi["metadata"]["name"]

    parts_limits = parts_to_delay_insert
    selected_svc = delayed_svc

    def insert_many_parts_to_clickhouse():
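        # Stop merges and insert with 1-row blocks so every row becomes its own part,
        # pushing the part count past parts_to_delay_insert / parts_to_throw_insert.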
        stop_merges = "SYSTEM STOP MERGES default.test;"
        min_block = "SET max_block_size=1; SET max_insert_block_size=1; SET min_insert_block_size_rows=1;"
        with When(f"Insert to MergeTree table {parts_limits} parts"):
            r = parts_limits
            sql = stop_merges + min_block + f"INSERT INTO default.test(event_time, test) SELECT now(),number FROM system.numbers LIMIT {r};"
            clickhouse.query(chi_name, sql, host=selected_svc, ns=kubectl.namespace)

            sql = min_block + "INSERT INTO default.test(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1;"
            clickhouse.query_with_error(chi_name, sql, host=selected_svc, ns=kubectl.namespace)
            with Then(f"wait prometheus_scrape_interval={prometheus_scrape_interval}*2 sec"):
                time.sleep(prometheus_scrape_interval * 2)

            with Then("after 21.8 InsertedRows include system.* rows"):
                for i in range(35):
                    sql = min_block + "INSERT INTO default.test(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1;"
                    clickhouse.query_with_error(chi_name, sql, host=selected_svc, ns=kubectl.namespace)

    insert_many_parts_to_clickhouse()
    with Then("check ClickHouseDelayedInsertThrottling firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseDelayedInsertThrottling", "firing", True, labels={"hostname": delayed_svc}, time_range="60s"
        )
        assert fired, error("can't get ClickHouseDelayedInsertThrottling alert in firing state")
    with Then("check ClickHouseMaxPartCountForPartition firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseMaxPartCountForPartition", "firing", True, labels={"hostname": delayed_svc}, time_range="90s"
        )
        assert fired, error("can't get ClickHouseMaxPartCountForPartition alert in firing state")
    with Then("check ClickHouseLowInsertedRowsPerQuery firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseLowInsertedRowsPerQuery", "firing", True, labels={"hostname": delayed_svc}, time_range="120s",
        )
        assert fired, error("can't get ClickHouseLowInsertedRowsPerQuery alert in firing state")

    clickhouse.query(chi_name, "SYSTEM START MERGES default.test", host=selected_svc, ns=kubectl.namespace)

    with Then("check ClickHouseDelayedInsertThrottling gone away"):
        resolved = alerts.wait_alert_state("ClickHouseDelayedInsertThrottling", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDelayedInsertThrottling alert is gone away")
    with Then("check ClickHouseMaxPartCountForPartition gone away"):
        resolved = alerts.wait_alert_state("ClickHouseMaxPartCountForPartition", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseMaxPartCountForPartition alert is gone away")
    with Then("check ClickHouseLowInsertedRowsPerQuery gone away"):
        resolved = alerts.wait_alert_state("ClickHouseLowInsertedRowsPerQuery", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseLowInsertedRowsPerQuery alert is gone away")

    parts_limits = parts_to_throw_insert
    selected_svc = rejected_svc
    insert_many_parts_to_clickhouse()
    with Then("check ClickHouseRejectedInsert firing"):
        fired = alerts.wait_alert_state("ClickHouseRejectedInsert", "firing", True, labels={"hostname": rejected_svc}, time_range="30s",
                                        sleep_time=settings.prometheus_scrape_interval)
        assert fired, error("can't get ClickHouseRejectedInsert alert in firing state")

    with Then("check ClickHouseRejectedInsert gone away"):
        resolved = alerts.wait_alert_state("ClickHouseRejectedInsert", "firing", False, labels={"hostname": rejected_svc})
        assert resolved, error("can't check ClickHouseRejectedInsert alert is gone away")

    clickhouse.query(chi_name, "SYSTEM START MERGES default.test", host=selected_svc, ns=kubectl.namespace)
    clickhouse.drop_table_on_cluster(chi)
def test_backup_duration(self, chi, minio_spec):
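    """Check ClickHouseBackupTooLong and ClickHouseBackupTooShort: swap in the fake
    metrics endpoint so ClickHouseBackupTooLong fires (the fake exporter is assumed to
    report an overly long duration by default), then overwrite the metric with a tiny
    duration so ClickHouseBackupTooShort fires, and finally restore the real sidecar and
    expect both alerts to resolve."""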
    short_pod, _, long_pod, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup duration metric")

    for pod in [short_pod, long_pod]:
        with Then(f"wait {pod} ready"):
            kubectl.wait_field("pod", pod, ".spec.containers[1].image",
                               "nginx:latest")
            kubectl.wait_field("pod", pod,
                               ".status.containerStatuses[1].ready", "true")

            fired = alerts.wait_alert_state(
                "ClickHouseBackupTooLong",
                "firing",
                expected_state=True,
                sleep_time=settings.prometheus_scrape_interval,
                labels={"pod_name": pod},
                time_range='60s')
            assert fired, error(
                f"can't get ClickHouseBackupTooLong alert in firing state for {pod}"
            )

    with Then(f"wait when prometheus will scrape fake data"):
        time.sleep(70)

    with Then(f"decrease {short_pod} backup duration"):
        kubectl.launch(
            f'exec {short_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_duration Backup create duration in nanoseconds" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_duration gauge" >> /usr/share/nginx/html/metrics && '
            'echo "clickhouse_backup_last_create_duration 7000000000000" >> /usr/share/nginx/html/metrics && '
            'echo "# HELP clickhouse_backup_last_create_status Last backup create status: 0=failed, 1=success, 2=unknown" >> /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_status gauge" >> /usr/share/nginx/html/metrics && '
            'echo "clickhouse_backup_last_create_status 1" >> /usr/share/nginx/html/metrics'
            '\'')

        fired = alerts.wait_alert_state(
            "ClickHouseBackupTooShort",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": short_pod},
            time_range='60s')
        assert fired, error(
            "can't get ClickHouseBackupTooShort alert in firing state")

    apply_normal_backup()

    with Then("check ClickHouseBackupTooShort gone away"):
        resolved = alerts.wait_alert_state("ClickHouseBackupTooShort",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": short_pod})
        assert resolved, error(
            "can't get ClickHouseBackupTooShort alert is gone away")

    with Then("check ClickHouseBackupTooLong gone away"):
        resolved = alerts.wait_alert_state("ClickHouseBackupTooLong",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": long_pod})
        assert resolved, error(
            "can't get ClickHouseBackupTooLong alert is gone away")