def get_prometheus_and_alertmanager_spec():
    """Fetch the pod specs of the prometheus-operator, prometheus and alertmanager.

    All three lookups are ``kubectl.get("pod", ...)`` calls in
    ``settings.prometheus_namespace`` that differ only in the label selector.

    The prometheus result is validated: it must have a non-empty ``"items"``
    list whose first element has a ``"metadata"`` key, otherwise ``fail()`` is
    called with a hint to run ``create-prometheus.sh``.

    Returns:
        Tuple ``(prometheus_operator_spec, prometheus_spec, alertmanager_spec)``.
    """

    def pods_by_label(selector):
        # Every lookup shares the namespace and an empty name; only the
        # label selector varies.
        return kubectl.get("pod", ns=settings.prometheus_namespace, name="", label=selector)

    with Given("get information about prometheus installation"):
        operator_pods = pods_by_label(
            "-l app.kubernetes.io/component=controller,app.kubernetes.io/name=prometheus-operator"
        )
        alertmanager_pods = pods_by_label("-l app=alertmanager,alertmanager=alertmanager")
        prometheus_pods = pods_by_label("-l app=prometheus,prometheus=prometheus")

        spec_is_invalid = (
            "items" not in prometheus_pods
            or len(prometheus_pods["items"]) == 0
            or "metadata" not in prometheus_pods["items"][0]
        )
        if spec_is_invalid:
            fail("invalid prometheus_spec, please run create-prometheus.sh")

        return operator_pods, prometheus_pods, alertmanager_pods
def check_alert_state(alert_name, prometheus_pod, alert_state="firing", labels=None, time_range="10s"):
    """Check whether Prometheus reports an alert with the given state and labels.

    Runs ``wget`` inside the ``prometheus`` container of ``prometheus_pod``
    (via ``kubectl.launch``) against the local query API, asking for the
    ``ALERTS{...}[time_range]`` selector built from the expected labels.

    Args:
        alert_name: Value matched against the ``alertname`` label.
        prometheus_pod: Name of the pod to exec into.
        alert_state: Value matched against the ``alertstate`` label.
        labels: Optional dict of extra label name/value pairs to match.
            The dict is not modified by this function.
        time_range: Range selector appended to the query, e.g. ``"10s"``.

    Returns:
        ``False`` when the query result is empty; otherwise whether every
        expected label pair is present in the ``metric`` of the first
        returned series (only the first series is inspected).

    ``fail()`` is called when ``labels`` is not a dict or when the API
    response does not carry ``status == "success"``.
    """
    with Then(f"check {alert_name} for state {alert_state} and {labels} labels in {time_range}"):
        if labels is None:
            labels = {}
        if not isinstance(labels, dict):
            fail(f"Invalid labels={labels}")

        # Build a merged copy instead of calling labels.update(): the caller's
        # dict must not silently gain "alertname"/"alertstate" entries.
        expected = {**labels, "alertname": alert_name, "alertstate": alert_state}
        selector = ",".join(f'{name}="{value}"' for name, value in expected.items())

        cmd = (
            f"exec -n {settings.prometheus_namespace} {prometheus_pod} -c prometheus -- "
            "wget -qO- 'http://127.0.0.1:9090/api/v1/query?query=ALERTS{"
            f"{selector}}}[{time_range}]' 2>/dev/null"
        )
        out = json.loads(kubectl.launch(cmd))

        if "status" not in out or out["status"] != "success":
            fail("wrong response from prometheus query API")

        results = out["data"]["result"]
        if len(results) == 0:
            with Then("not present, empty result"):
                return False

        result_labels = results[0]["metric"].items()
        exists = all(item in result_labels for item in expected.items())
        with Then("got result and contains labels" if exists else "got result, but doesn't contain labels"):
            return exists
def set_operator_version(version, ns=settings.operator_namespace, timeout=60):
    """Switch the clickhouse-operator deployment to images tagged ``version``.

    Sets the image of the ``clickhouse-operator`` container and then of the
    ``metrics-exporter`` container, waits for the rollout via
    ``kubectl rollout status`` and finally calls ``fail()`` if no pod
    matching ``operator_label`` is found in the namespace.

    Args:
        version: Image tag appended to the repositories configured in
            ``settings``.
        ns: Namespace of the operator deployment.
        timeout: Passed to the ``rollout status`` invocation.
    """
    deployment = "deployment.v1.apps/clickhouse-operator"
    # Both image references are resolved up front, before any kubectl call.
    container_images = (
        ("clickhouse-operator", f"{settings.operator_docker_repo}:{version}"),
        ("metrics-exporter", f"{settings.metrics_exporter_docker_repo}:{version}"),
    )

    for container, image in container_images:
        kubectl.launch(f"set image {deployment} {container}={image}", ns=ns)

    kubectl.launch(f"rollout status {deployment}", ns=ns, timeout=timeout)

    running_pods = kubectl.get_count("pod", ns=ns, label=operator_label)
    if running_pods == 0:
        fail("invalid clickhouse-operator pod count")
def is_expected_backup_status(command_name, command_is_done, st, expected_status, err_status):
    """Evaluate one backup status entry against the awaited command and status.

    Args:
        command_name: Command the caller is waiting for.
        command_is_done: Current "done" flag; echoed back unchanged unless
            the expected status is reached.
        st: Status entry (mapping); only entries whose ``'command'`` equals
            ``command_name`` are inspected.
        expected_status: Status value that means the command finished.
        err_status: Status value that is reported through ``fail()``.

    Returns:
        ``(True, True)`` when ``st`` belongs to ``command_name`` and has
        ``expected_status``; otherwise ``(False, command_is_done)``. For a
        matching entry that has neither the expected nor the error status,
        the function sleeps 5 seconds before returning.
    """
    # Entries for other commands (or without a command) are ignored.
    if 'command' not in st or st['command'] != command_name:
        return False, command_is_done

    status = st['status']
    if status == expected_status:
        return True, True

    if status == err_status:
        if 'error' in st:
            fail(st['error'])
        else:
            fail(f'unexpected status of {command_name} {st}')
    else:
        with Then('Not ready, wait 5 sec'):
            time.sleep(5)

    return False, command_is_done
def query(self, q, params=(), fetch=True):
    """Execute ``q`` on ``self.connection`` and optionally fetch all rows.

    Args:
        q: Query text; it is logged through ``note()`` before execution.
        params: Sequence of positional query parameters, unpacked into
            ``cursor.execute(q, *params)``. Defaults to an immutable empty
            tuple rather than a shared mutable list.
        fetch: When true, all rows are fetched, each row is logged through
            ``note()`` and the rows are returned.

    Returns:
        The result of ``cursor.fetchall()`` when ``fetch`` is true,
        otherwise ``None``.

    A ``pyodbc.Error`` is reported through ``exception()`` and ``fail()``
    (presumably aborting the current test step - confirm against the test
    framework).
    """
    try:
        note(f"query: {q}")
        cursor = self.connection.cursor()
        cursor.execute(q, *params)
        if fetch:
            rows = cursor.fetchall()
            for row in rows:
                note(row)
            return rows
    except pyodbc.Error as exc:
        exception()
        fail(str(exc))
    finally:
        if self.logs and settings.debug:
            # sleep 0.5 sec to let messages to be written to the logs
            time.sleep(0.5)
            self.logs.read(timeout=0.1)
    return None
def test_read_only_replica(self):
    """Check that ClickHouseReadonlyReplica fires and then resolves.

    Steps:
      1. Create a replicated table ``default.test_repl`` on the cluster.
      2. Wait for the alert to fire for the chosen service; the wait is
         given a callback that kills PID 1 in ``zookeeper-0`` and then
         attempts an INSERT on that service.
      3. Wait until the alert is no longer firing.
      4. Wait for ``zookeeper-0`` to be Running and ready, and poll ``ruok``
         up to 11 times, 2 seconds apart.
      5. Restart and sync replicas on both chosen services, then drop the
         test table.

    NOTE(review): ``chi`` is not a parameter of this function; it is
    resolved from an outer scope that is not visible here - confirm.
    """
    # Pod names are returned as well, but only the service names are used.
    _, read_only_svc, _, other_svc = alerts.random_pod_choice_for_callbacks(chi)
    chi_name = chi["metadata"]["name"]

    clickhouse.create_table_on_cluster(
        chi,
        'all-replicated',
        'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
        # Killing PID 1 restarts the zookeeper container; the exec itself
        # may fail, so both calls tolerate errors.
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(
            chi_name,
            "INSERT INTO default.test_repl VALUES(now(),rand())",
            host=read_only_svc,
        )

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            True,
            labels={"hostname": read_only_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=restart_zookeeper,
        )
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")

    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            False,
            labels={"hostname": read_only_svc},
        )
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath(
        "pod",
        "zookeeper-0",
        "{.status.containerStatuses[0].ready}",
        "true",
        ns=kubectl.namespace,
    )

    # Poll zookeeper's four-letter "ruok" command until it answers "imok".
    for i in range(11):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True,
        )
        if "imok" in zookeeper_status:
            break
        elif i == 10:
            fail(f"invalid zookeeper status after {i} retries")
        with Then("zookeper is not ready, wait 2 seconds"):
            time.sleep(2)

    # The same restart/sync statement is issued on both services, the
    # read-only one first.
    sync_replica_sql = "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl"
    for svc in (read_only_svc, other_svc):
        clickhouse.query_with_error(chi_name, sync_replica_sql, host=svc, timeout=240)

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')