def test_read_only_replica():
    read_only_pod, read_only_svc, other_pod, other_svc = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]
    create_table_on_cluster('all-replicated', 'default.test_repl',
                            '(event_time DateTime, test UInt64) ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/{uuid}/test_repl\', \'{replica}\') ORDER BY tuple()')

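    # Killing PID 1 restarts the single-node ZooKeeper; inserting into the replicated
    # table while ZooKeeper is unavailable pushes the replica into read-only mode,
    # which is what ClickHouseReadonlyReplica watches for.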
    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = wait_alert_state("ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
                                 time_range='30s', sleep_time=5, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

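    # Bring both replicas back to a consistent state; query_with_error is used because
    # SYSTEM RESTART REPLICAS may still time out right after the ZooKeeper restart.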
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240
    )
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240
    )

    drop_table_on_cluster('all-replicated', 'default.test_repl')


def test_zookeeper_hardware_exceptions():
    pod1, svc1, pod2, svc2 = random_pod_choice_for_callbacks()
    chi_name = chi["metadata"]["name"]

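    # Restart ZooKeeper and then query system.zookeeper on both replicas: while
    # ZooKeeper is down these queries raise ZooKeeper (hardware) exceptions,
    # which is what the alert counts.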
    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc1)
        clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc2)

    with Then("check ClickHouseZooKeeperHardwareExceptions firing"):
        for svc in (svc1, svc2):
            fired = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", True, labels={"hostname": svc},
                                     time_range='40s', sleep_time=5, callback=restart_zookeeper)
            assert fired, error("can't get ClickHouseZooKeeperHardwareExceptions alert in firing state")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

    with Then("check ClickHouseZooKeeperHardwareExceptions gone away"):
        for svc in (svc1, svc2):
            resolved = wait_alert_state("ClickHouseZooKeeperHardwareExceptions", "firing", False, labels={"hostname": svc})
            assert resolved, error("can't check ClickHouseZooKeeperHardwareExceptions alert is gone away")


def require_zookeeper(manifest='zookeeper-1-node-1GB-for-tests-only.yaml', force_install=False):
    with Given("Install Zookeeper if missing"):
        if force_install or kubectl.get_count("service", name="zookeeper") == 0:
            config = util.get_full_path(f"../deploy/zookeeper/quick-start-persistent-volume/{manifest}")
            kubectl.apply(config)
            kubectl.wait_object("pod", "zookeeper-0")
            kubectl.wait_pod_status("zookeeper-0", "Running")
def test_distributed_connection_exceptions():
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

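    # Kill the remote clickhouse-server so inserts into the Distributed table from
    # the surviving pod fail to connect, incrementing the distributed connection
    # exception counters behind the alert.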
    def reboot_clickhouse_and_distributed_execution():
        # we need about 70 delayed files to trigger the alert
        insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
        select_sql = 'SELECT count() FROM default.test_distr'
        with Then("reboot clickhouse-server pod"):
            kubectl.launch(
                f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
                ok_to_fail=True,
            )
            with And("Insert to distributed table"):
                clickhouse.query(chi["metadata"]["name"], insert_sql, host=delayed_pod, ns=kubectl.namespace)

            with And("Select from distributed table"):
                clickhouse.query_with_error(chi["metadata"]["name"], select_sql, host=delayed_pod,
                                            ns=kubectl.namespace)

    with When("check ClickHouseDistributedConnectionExceptions firing"):
        fired = wait_alert_state("ClickHouseDistributedConnectionExceptions", "firing", True,
                                 labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]}, time_range='30s',
                                 callback=reboot_clickhouse_and_distributed_execution)
        assert fired, error("can't get ClickHouseDistributedConnectionExceptions alert in firing state")

    with Then("check DistributedConnectionExpections gone away"):
        resolved = wait_alert_state("ClickHouseDistributedConnectionExceptions", "firing", False,
                                    labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedConnectionExceptions alert is gone away")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", restarted_pod, "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)
    drop_distributed_table_on_cluster()


def wait_when_zookeeper_up(zookeeper_pod="zookeeper-0"):
    # zookeeper_pod defaults to the single-node quick-start ZooKeeper used by these tests
    kubectl.wait_pod_status(zookeeper_pod, "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod",
                          zookeeper_pod,
                          "{.status.containerStatuses[0].ready}",
                          "true",
                          ns=kubectl.namespace)


def restart_operator(ns=settings.operator_namespace, timeout=60):
    pod_name = kubectl.get(
        "pod", name="", ns=ns,
        label="-l app=clickhouse-operator")["items"][0]["metadata"]["name"]
    kubectl.launch(f"delete pod {pod_name}", ns=ns, timeout=timeout)
    kubectl.wait_object("pod",
                        name="",
                        ns=ns,
                        label="-l app=clickhouse-operator")
    pod_name = kubectl.get(
        "pod", name="", ns=ns,
        label="-l app=clickhouse-operator")["items"][0]["metadata"]["name"]
    kubectl.wait_pod_status(pod_name, "Running", ns=ns)


def test_distributed_files_to_insert():
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1000'
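    # Stop distributed sends so the inserts below accumulate as pending .bin files
    # on disk instead of being pushed to the remote shard.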
    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM STOP DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    files_to_insert_from_metrics = 0
    files_to_insert_from_disk = 0
    tries = 0
    # we need more than 50 delayed files to trigger the alert
    while files_to_insert_from_disk <= 55 and files_to_insert_from_metrics <= 55 and tries < 500:
        tries += 1
        kubectl.launch(
            f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
            ok_to_fail=True,
        )
        clickhouse.query(chi["metadata"]["name"], insert_sql, pod=delayed_pod, host=delayed_pod, ns=kubectl.namespace)
        files_to_insert_from_metrics = clickhouse.query(
            chi["metadata"]["name"], "SELECT value FROM system.metrics WHERE metric='DistributedFilesToInsert'",
            pod=delayed_pod, ns=kubectl.namespace
        )
        files_to_insert_from_metrics = int(files_to_insert_from_metrics)

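        # Pending distributed inserts are stored as .bin files under the table's data
        # directory; counting them on disk cross-checks the DistributedFilesToInsert metric.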
        files_to_insert_from_disk = int(kubectl.launch(
            f"exec -n {kubectl.namespace} {delayed_pod} -c clickhouse -- bash -c 'ls -la /var/lib/clickhouse/data/default/test_distr/*/*.bin 2>/dev/null | wc -l'",
            ok_to_fail=False,
        ))

    with When("reboot clickhouse-server pod"):
        fired = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", True,
                                 labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]})
        assert fired, error("can't get ClickHouseDistributedFilesToInsertHigh alert in firing state")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)

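    # Re-enable distributed sends so the backlog drains and the alert can resolve.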
    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM START DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    with Then("check ClickHouseDistributedFilesToInsertHigh gone away"):
        resolved = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDistributedFilesToInsertHigh alert is gone away")

    drop_distributed_table_on_cluster()


def test_zookeeper_rescale(self):
    with When('create replicated table'):
        clickhouse.create_table_on_cluster(
            chi, 'all-sharded', 'default.zk_repl',
            '(id UInt64) ENGINE=ReplicatedMergeTree(\'/clickhouse/tables/default.zk_repl/{shard}\',\'{replica}\') ORDER BY (id)'
        )
    with Then('insert data x1'):
        clickhouse.query(
            chi['metadata']['name'],
            'INSERT INTO default.zk_repl SELECT number FROM numbers(1000)',
            pod="chi-test-cluster-for-zk-default-0-0-0")
    with Then('scale up zookeeper to 3 nodes'):
        util.require_zookeeper('zookeeper-3-nodes-1GB-for-tests-only.yaml',
                               force_install=True)
        kubectl.wait_pod_status('zookeeper-0', 'Running',
                                settings.test_namespace)
        kubectl.wait_pod_status('zookeeper-1', 'Running',
                                settings.test_namespace)
        kubectl.wait_pod_status('zookeeper-2', 'Running',
                                settings.test_namespace)

    with Then('insert data x2'):
        clickhouse.query(
            chi['metadata']['name'],
            'INSERT INTO default.zk_repl SELECT number*2 FROM numbers(1000)',
            pod="chi-test-cluster-for-zk-default-0-1-0")

    with Then('scale down zookeeper to 1 node'):
        util.require_zookeeper('zookeeper-1-node-1GB-for-tests-only.yaml',
                               force_install=True)
        kubectl.wait_pod_status('zookeeper-0', 'Running',
                                settings.test_namespace)

    with Then('insert data x3'):
        clickhouse.query(
            chi['metadata']['name'],
            'INSERT INTO default.zk_repl SELECT number*3 FROM numbers(1000)',
            pod="chi-test-cluster-for-zk-default-0-0-0")

    assert clickhouse.query(chi['metadata']['name'],
                            'SELECT count() FROM default.zk_repl',
                            pod="chi-test-cluster-for-zk-default-0-1-0"
                            ) == '3000', "Invalid rows after 3x1000 inserts"

    clickhouse.drop_table_on_cluster(chi, 'all-sharded', 'default.zk_repl')


def test_014():
    require_zookeeper()

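    # Note: the trailing .replace() calls below strip newlines so the DDL is sent as a
    # single-line query string.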
    create_table = """
    CREATE TABLE test_local(a Int8) 
    Engine = ReplicatedMergeTree('/clickhouse/{installation}/{cluster}/tables/{shard}/{database}/{table}', '{replica}')
    PARTITION BY tuple() 
    ORDER BY a
    """.replace('\r', '').replace('\n', '')

    config = "configs/test-014-replication-1.yaml"
    chi = manifest.get_chi_name(util.get_full_path(config))
    cluster = "default"

    kubectl.create_and_check(
        config=config,
        check={
            "apply_templates": {
                settings.clickhouse_template,
                "templates/tpl-persistent-volume-100Mi.yaml",
            },
            "object_counts": {
                "statefulset": 2,
                "pod": 2,
                "service": 3,
            },
            "do_not_delete": 1,
        })

    start_time = kubectl.get_field("pod", f"chi-{chi}-{cluster}-0-0-0",
                                   ".status.startTime")

    schema_objects = ['test_local', 'test_view', 'test_mv', 'a_view']
    with Given("Create schema objects"):
        clickhouse.query(chi, create_table, host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(chi,
                         "CREATE VIEW test_view as SELECT * from test_local",
                         host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(chi,
                         "CREATE VIEW a_view as SELECT * from test_view",
                         host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(
            chi,
            "CREATE MATERIALIZED VIEW test_mv Engine = Log as SELECT * from test_local",
            host=f"chi-{chi}-{cluster}-0-0")
        clickhouse.query(
            chi,
            "CREATE DICTIONARY test_dict (a Int8, b Int8) PRIMARY KEY a SOURCE(CLICKHOUSE(host 'localhost' port 9000 table 'test_local' user 'default')) LAYOUT(FLAT()) LIFETIME(0)",
            host=f"chi-{chi}-{cluster}-0-0")

    with Given(
            "Replicated table is created on the first replica and data is inserted"
    ):
        clickhouse.query(chi,
                         "INSERT INTO test_local values(1)",
                         host=f"chi-{chi}-{cluster}-0-0")
        with When("Table is created on the second replica"):
            clickhouse.query(chi,
                             create_table,
                             host=f"chi-{chi}-{cluster}-0-1")
            # Give some time for replication to catch up
            time.sleep(10)
            with Then("Data should be replicated"):
                out = clickhouse.query(chi,
                                       "SELECT a FROM test_local",
                                       host=f"chi-{chi}-{cluster}-0-1")
                assert out == "1"

    with When("Add one more replica"):
        kubectl.create_and_check(config="configs/test-014-replication-2.yaml",
                                 check={
                                     "pod_count": 3,
                                     "do_not_delete": 1,
                                 })
        # Give some time for replication to catch up
        time.sleep(10)

        new_start_time = kubectl.get_field("pod", f"chi-{chi}-{cluster}-0-0-0",
                                           ".status.startTime")
        assert start_time == new_start_time

        with Then("Schema objects should be migrated to the new replica"):
            for obj in schema_objects:
                out = clickhouse.query(
                    chi,
                    f"SELECT count() FROM system.tables WHERE name = '{obj}'",
                    host=f"chi-{chi}-{cluster}-0-2")
                assert out == "1"
            # Check dictionary
            out = clickhouse.query(
                chi,
                f"SELECT count() FROM system.dictionaries WHERE name = 'test_dict'",
                host=f"chi-{chi}-{cluster}-0-2")
            assert out == "1"

        with And("Replicated table should have the data"):
            out = clickhouse.query(chi,
                                   "SELECT a FROM test_local",
                                   host=f"chi-{chi}-{cluster}-0-2")
            assert out == "1"

    with When("Remove replica"):
        kubectl.create_and_check(config=config,
                                 check={
                                     "pod_count": 1,
                                     "do_not_delete": 1,
                                 })

        new_start_time = kubectl.get_field("pod", f"chi-{chi}-{cluster}-0-0-0",
                                           ".status.startTime")
        assert start_time == new_start_time

        with Then("Replica needs to be removed from the Zookeeper as well"):
            out = clickhouse.query(
                chi,
                "SELECT count() FROM system.replicas WHERE table='test_local'")
            assert out == "1"

    with When("Restart Zookeeper pod"):
        with Then("Delete Zookeeper pod"):
            kubectl.launch("delete pod zookeeper-0")
            time.sleep(1)

        with Then(
                "Insert into the table while there is no Zookeeper -- table should be in readonly mode"
        ):
            out = clickhouse.query_with_error(
                chi, "INSERT INTO test_local values(2)")
            assert "Table is in readonly mode" in out

        with Then("Wait for Zookeeper pod to come back"):
            kubectl.wait_object("pod", "zookeeper-0")
            kubectl.wait_pod_status("zookeeper-0", "Running")

        with Then(
                "Wait for ClickHouse to reconnect to Zookeeper and switch to read-write mode"
        ):
            time.sleep(30)
        # with Then("Restart clickhouse pods"):
        #    kubectl("delete pod chi-test-014-replication-default-0-0-0")
        #    kubectl("delete pod chi-test-014-replication-default-0-1-0")

        with Then("Table should be back to normal"):
            clickhouse.query(chi, "INSERT INTO test_local values(3)")

    kubectl.delete_chi("test-014-replication")


def test_read_only_replica(self):
    read_only_pod, read_only_svc, other_pod, other_svc = alerts.random_pod_choice_for_callbacks(
        chi)
    chi_name = chi["metadata"]["name"]
    clickhouse.create_table_on_cluster(
        chi, 'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(
            chi_name,
            "INSERT INTO default.test_repl VALUES(now(),rand())",
            host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            True,
            labels={"hostname": read_only_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=restart_zookeeper)
        assert fired, error(
            "can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = alerts.wait_alert_state("ClickHouseReadonlyReplica",
                                           "firing",
                                           False,
                                           labels={"hostname": read_only_svc})
        assert resolved, error(
            "can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod",
                          "zookeeper-0",
                          "{.status.containerStatuses[0].ready}",
                          "true",
                          ns=kubectl.namespace)

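    # Poll ZooKeeper with the 'ruok' four-letter-word command until it answers 'imok',
    # i.e. until it is actually accepting connections again.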
    for i in range(11):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True)
        if "imok" in zookeeper_status:
            break
        elif i == 10:
            fail(f"invalid zookeeper status after {i} retries")
        with Then("zookeper is not ready, wait 2 seconds"):
            time.sleep(2)

    clickhouse.query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc,
        timeout=240)
    clickhouse.query_with_error(
        chi_name,
        "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc,
        timeout=240)

    clickhouse.drop_table_on_cluster(chi, 'all-replicated',
                                     'default.test_repl')