Example #1
def inspect_component(namespace):
    dir_name = "inspect_data/" + namespace + "-logs"
    if os.path.isdir(dir_name):
        runcommand.invoke("rm -R " + dir_name)
        logging.info("Deleted existing %s directory" % (dir_name))
    command_out = runcommand.invoke("oc adm inspect ns/" + namespace + " --dest"
                                    "-dir=" + dir_name + " | tr -d '\n'")
    logging.info(command_out)
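
Every snippet on this page shells out through a runcommand.invoke helper that is not shown here. The sketch below is an assumption about its shape: a thin wrapper around subprocess.run that executes the command through a shell and returns stdout; the project's real helper may differ in error handling and timeouts.

import logging
import subprocess


def invoke(command, timeout=None):
    # Hypothetical sketch: run a shell command and return its stdout as a string.
    try:
        result = subprocess.run(command, shell=True, check=True, capture_output=True,
                                universal_newlines=True, timeout=timeout)
    except subprocess.CalledProcessError as e:
        logging.error("Command %s failed: %s", command, e.stderr)
        raise
    return result.stdout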
Example #2
def slack_logging(cluster_info, iteration, watch_nodes_status, failed_nodes,
                  watch_cluster_operators_status, failed_operators,
                  watch_namespaces_status, failed_pods_components,
                  custom_checks_status, custom_checks_fail_messages):
    issues = []
    cerberus_report_path = runcommand.invoke("pwd | tr -d '\n'")
    if not watch_nodes_status:
        issues.append("*nodes: " + ", ".join(failed_nodes) + "*")
    if not watch_cluster_operators_status:
        issues.append("*cluster operators: " + ", ".join(failed_operators) +
                      "*")
    if not watch_namespaces_status:
        issues.append("*namespaces: " +
                      ", ".join(list(failed_pods_components.keys())) + "*")
    if not custom_checks_status:
        issues.append("*custom_checks: " +
                      ", ".join(custom_checks_fail_messages) + "*")
    issues = "\n".join(issues)
    post_message_in_slack(
        slack_tag + " %sIn iteration %d at %s, Cerberus "
        "found issues in: \n%s \nHence, setting the "
        "go/no-go signal to false. \nThe full report "
        "is at *%s* on the host cerberus is running." %
        (cluster_info, iteration, datetime.datetime.now().replace(
            microsecond=0).isoformat(), issues, cerberus_report_path),
        thread_ts)
Example #3
def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token):
    global prom_cli
    if distribution == "openshift" and not prometheus_url:
        url = runcommand.invoke(
            r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'"""  # noqa
        )
        prometheus_url = "https://" + url
    if distribution == "openshift" and not prometheus_bearer_token:
        prometheus_bearer_token = runcommand.invoke("oc -n openshift-monitoring sa get-token prometheus-k8s")
    if prometheus_url and prometheus_bearer_token:
        bearer = "Bearer " + prometheus_bearer_token
        headers = {"Authorization": bearer}
        try:
            prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True)
        except Exception as e:
            logging.error("Not able to initialize the client %s" % e)
    else:
        prom_cli = None
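
Once prom_cli is initialized, queries go through the prometheus-api-client package. A minimal usage sketch, assuming the client was set up as above; the PromQL expression is only an illustration.

if prom_cli:
    query = r"""ALERTS{alertname="KubeAPILatencyHigh", severity="warning"}"""
    # custom_query returns a list of metric samples; an empty list means nothing is firing.
    metrics = prom_cli.custom_query(query=query)
    if metrics:
        logging.warning("Alert firing: %s" % (metrics))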
Example #4
def check_storage():
    logging.info("Check if netapp storages are all available.")

    trident_backend_list = runcommand.invoke(
        "oc -n openshift-bcgov-trident get TridentBackends -o json")
    trident_backends = json.loads(trident_backend_list)['items']

    for storage in trident_backends:
        storage_name = storage['metadata']['name']
        logging.info("-> TridentBackends " + storage_name)

        status_output = runcommand.invoke(
            "oc -n openshift-bcgov-trident get TridentBackends " +
            storage_name + " -o json")
        status = json.loads(status_output)['state']

        if (status != "online"):
            return False

    return True
Example #5
def check_image_registry_and_routing():
    logging.info("Check Image Registry API and test on routing layer.")

    # get image_registry URL:
    image_registry_route = runcommand.invoke(
        "oc -n openshift-image-registry get route/public-registry -o json")
    image_registry_host = json.loads(image_registry_route)['spec']['host']
    image_registry_url = "https://" + image_registry_host + "/healthz"
    logging.info("Detected Image Registry API: " + image_registry_url)
    (resp, content) = h.request(image_registry_url, "GET")

    return resp.status == 200
Example #6
def check():
    node_usage = subprocess.check_output(
        "oc adm top  nodes --no-headers | awk '{ print $5 }' | tr -d '%' | sort -rn | head -1",
        shell=True,
        universal_newlines=True)
    if int(node_usage) < 90:
        logging.info("No abnormalities found in node resource utilization")
        message = "No abnormalities found in node resource utilization\n"
        return True, message
    else:
        logging.info("Node resource utilization exceeded the limit")
        message = "Node resource utilization exceeded the limit\n"
        node_usage_details = runcommand.invoke("oc adm top nodes")
        logging.info("Output of: oc adm top nodes\n%s" % (node_usage_details))
        message = "\n Output of : oc adm top nodes:\n %s" % (
            node_usage_details)
        return False, message
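
Note that the custom-check loop in Example #20 below consumes each check's main() as either a plain bool or a dict with "status" and "message" keys, not a tuple. A sketch of wrapping this check in that dict form, assuming that contract:

def main():
    status, message = check()
    # Repackage the tuple into the shape the custom-check loop understands.
    return {"status": status, "message": message}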
Example #7
def monitor_nodes():
    notready_nodes = []
    all_node_info = runcommand.invoke("kubectl get nodes -o json")
    all_node_info = json.loads(all_node_info)
    for node_info in all_node_info["items"]:
        node = node_info["metadata"]["name"]
        node_kerneldeadlock_status = "False"
        for condition in node_info["status"]["conditions"]:
            if condition["type"] == "KernelDeadlock":
                node_kerneldeadlock_status = condition["status"]
            elif condition["type"] == "Ready":
                node_ready_status = condition["status"]
            else:
                continue
        if node_kerneldeadlock_status != "False" or node_ready_status != "True":
            notready_nodes.append(node)
    status = False if notready_nodes else True
    return status, notready_nodes
Example #8
def check_master_taint(master_nodes):
    schedulable_masters = []
    all_master_info = runcommand.invoke("kubectl get nodes " +
                                        " ".join(master_nodes) + " -o json")
    all_master_info = json.loads(all_master_info)
    for node_info in all_master_info["items"]:
        node = node_info["metadata"]["name"]
        NoSchedule_taint = False
        try:
            for taint in node_info["spec"]["taints"]:
                if taint["key"] == "node-role.kubernetes.io/master" and \
                    taint["effect"] == "NoSchedule":
                    NoSchedule_taint = True
                    break
            if not NoSchedule_taint:
                schedulable_masters.append(node)
        except Exception:
            schedulable_masters.append(node)
    return schedulable_masters
Example #9
def check_master_taint(master_nodes, master_label):
    schedulable_masters = []
    all_master_info = runcommand.invoke("kubectl get nodes " + " ".join(master_nodes) + " -o json", cmd_timeout)
    all_master_info = json.loads(all_master_info)
    if len(master_nodes) > 1:
        all_master_info = all_master_info["items"]
    else:
        all_master_info = [all_master_info]
    for node_info in all_master_info:
        node = node_info["metadata"]["name"]
        NoSchedule_taint = False
        try:
            for taint in node_info["spec"]["taints"]:
                if taint["key"] == str(master_label) and taint["effect"] == "NoSchedule":
                    NoSchedule_taint = True
                    break
            if not NoSchedule_taint:
                schedulable_masters.append(node)
        except Exception:
            schedulable_masters.append(node)
    return schedulable_masters
Example #10
def main():
    logging.info("------------------- Start Custom Checks -------------------")

    # set http client:
    global h
    h = httplib2.Http(disable_ssl_certificate_validation=True)

    # get cluster API url:
    global cluster_api_url
    cluster_api_url = runcommand.invoke(
        "kubectl cluster-info | awk 'NR==1' | sed -r "
        "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'")

    check1 = check_nodes()
    check2 = check_cluster_readyz()
    check3 = check_image_registry_and_routing()
    check4 = check_storage()
    logging.info(
        "------------------- Finished Custom Checks -------------------")

    return check1 & check2 & check3 & check4
Example #11
def main(cfg):
    # Start cerberus
    print(pyfiglet.figlet_format("cerberus"))
    logging.info("Starting ceberus")

    # Parse and read the config
    if os.path.isfile(cfg):
        with open(cfg, 'r') as f:
            config = yaml.full_load(f)
        distribution = config["cerberus"].get("distribution",
                                              "openshift").lower()
        kubeconfig_path = config["cerberus"].get("kubeconfig_path", "")
        watch_nodes = config["cerberus"].get("watch_nodes", False)
        watch_cluster_operators = config["cerberus"].get(
            "watch_cluster_operators", False)
        watch_namespaces = config["cerberus"].get("watch_namespaces", [])
        watch_url_routes = config["cerberus"].get("watch_url_routes", [])
        cerberus_publish_status = config["cerberus"].get(
            "cerberus_publish_status", False)
        inspect_components = config["cerberus"].get("inspect_components",
                                                    False)
        slack_integration = config["cerberus"].get("slack_integration", False)
        prometheus_url = config["cerberus"].get("prometheus_url", "")
        prometheus_bearer_token = config["cerberus"].get(
            "prometheus_bearer_token", "")
        iterations = config["tunings"].get("iterations", 0)
        sleep_time = config["tunings"].get("sleep_time", 0)
        daemon_mode = config["tunings"].get("daemon_mode", False)

        # Initialize clients
        if not os.path.isfile(kubeconfig_path):
            kubeconfig_path = None
        logging.info("Initializing client to talk to the Kubernetes cluster")
        kubecli.initialize_clients(kubeconfig_path)

        if "openshift-sdn" in watch_namespaces:
            sdn_namespace = kubecli.check_sdn_namespace()
            watch_namespaces = [
                namespace.replace('openshift-sdn', sdn_namespace)
                for namespace in watch_namespaces
            ]

        # Cluster info
        logging.info("Fetching cluster info")
        if distribution == "openshift":
            cluster_version = runcommand.invoke("kubectl get clusterversion")
            logging.info("\n%s" % (cluster_version))
        cluster_info = runcommand.invoke(
            "kubectl cluster-info | awk 'NR==1' | sed -r "
            "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'")  # noqa
        logging.info("%s" % (cluster_info))

        # Run http server using a separate thread if cerberus is asked
        # to publish the status. It is served by the http server.
        if cerberus_publish_status:
            address = ("0.0.0.0", 8080)
            server_address = address[0]
            port = address[1]
            logging.info("Publishing cerberus status at http://%s:%s" %
                         (server_address, port))
            server.start_server(address)

        # Create slack WebClient when slack integration has been enabled
        if slack_integration:
            slack_integration = slackcli.initialize_slack_client()

        # Run inspection only when the distribution is openshift
        if distribution == "openshift" and inspect_components:
            logging.info(
                "Detailed inspection of failed components has been enabled")
            inspect.delete_inspect_directory()

        # get list of all master nodes to verify scheduling
        master_nodes = kubecli.list_nodes("node-role.kubernetes.io/master")

        # Use cluster_info to get the api server url
        api_server_url = cluster_info.split(" ")[-1].strip() + "/healthz"

        # Counter for if api server is not ok
        api_fail_count = 0

        # Initialize the start iteration to 0
        iteration = 0

        # Set the number of iterations to loop to infinity if daemon mode is
        # enabled or else set it to the provided iterations count in the config
        if daemon_mode:
            logging.info("Daemon mode enabled, cerberus will monitor forever")
            logging.info("Ignoring the iterations set\n")
            iterations = float('inf')
        else:
            iterations = int(iterations)

        # Loop to run the components status checks starts here
        while (int(iteration) < iterations):
            # Initialize a dict to store the operations timings per iteration
            iter_track_time = {}
            # Capture the start time
            iteration_start_time = time.time()
            iteration += 1

            # Read the config for info when slack integration is enabled
            if slack_integration:
                weekday = runcommand.invoke("date '+%A'")[:-1]
                cop_slack_member_ID = config["cerberus"]["cop_slack_ID"].get(
                    weekday, None)
                slack_team_alias = config["cerberus"].get(
                    "slack_team_alias", None)
                slackcli.slack_tagging(cop_slack_member_ID, slack_team_alias)

                if iteration == 1:
                    slackcli.slack_report_cerberus_start(
                        cluster_info, weekday, cop_slack_member_ID)

            # Check if api server url is ok
            server_status = kubecli.is_url_available(api_server_url)
            if not server_status:
                api_fail_count += 1

            # Check for NoSchedule taint in all the master nodes once in every 10 iterations
            if iteration % 10 == 1:
                check_taint_start_time = time.time()
                schedulable_masters = kubecli.check_master_taint(master_nodes)
                iter_track_time['check_master_taint'] = time.time(
                ) - check_taint_start_time
                if schedulable_masters:
                    logging.warning(
                        "Iteration %s: Masters without NoSchedule taint: %s\n"
                        % (iteration, schedulable_masters))

            # Monitor nodes status
            if watch_nodes:
                watch_nodes_start_time = time.time()
                watch_nodes_status, failed_nodes = kubecli.monitor_nodes()
                iter_track_time['watch_nodes'] = time.time(
                ) - watch_nodes_start_time
                logging.info("Iteration %s: Node status: %s" %
                             (iteration, watch_nodes_status))
            else:
                logging.info(
                    "Cerberus is not monitoring nodes, so setting the status "
                    "to True and assuming that the nodes are ready")
                watch_nodes_status = True

            # Monitor cluster operators status
            if distribution == "openshift" and watch_cluster_operators:
                watch_co_start_time = time.time()
                status_yaml = kubecli.get_cluster_operators()
                watch_cluster_operators_status, failed_operators = \
                    kubecli.monitor_cluster_operator(status_yaml)
                iter_track_time['watch_cluster_operators'] = time.time(
                ) - watch_co_start_time
                logging.info("Iteration %s: Cluster Operator status: %s" %
                             (iteration, watch_cluster_operators_status))
            else:
                watch_cluster_operators_status = True

            if iteration == 1:
                for namespace in watch_namespaces:
                    kubecli.namespace_sleep_tracker(namespace)

            failed_pods_components = {}
            failed_pod_containers = {}
            watch_namespaces_status = True

            # Monitor each component in the namespace
            watch_namespaces_start_time = time.time()
            for namespace in watch_namespaces:
                watch_component_status, failed_component_pods, failed_containers = \
                    kubecli.monitor_namespace(namespace)
                logging.info("Iteration %s: %s: %s" %
                             (iteration, namespace, watch_component_status))
                watch_namespaces_status = watch_namespaces_status and watch_component_status
                if not watch_component_status:
                    failed_pods_components[namespace] = failed_component_pods
                    failed_pod_containers[namespace] = failed_containers

            iter_track_time['watch_namespaces'] = time.time(
            ) - watch_namespaces_start_time

            failed_routes = []
            if watch_url_routes:
                watch_routes_start_time = time.time()
                for route_info in watch_url_routes:
                    # Might need to get different authorization types here
                    header = {'Accept': 'application/json'}
                    if len(route_info) > 1:
                        header['Authorization'] = route_info[1]
                    route_status = kubecli.is_url_available(
                        route_info[0], header)
                    if not route_status:
                        failed_routes.append(route_info[0])
                iter_track_time['watch_routes'] = time.time(
                ) - watch_routes_start_time

            # Check for the number of hits
            if cerberus_publish_status:
                logging.info("HTTP requests served: %s \n" %
                             (server.SimpleHTTPRequestHandler.requests_served))

            # Logging the failed components
            if not watch_nodes_status:
                logging.info("Iteration %s: Failed nodes" % (iteration))
                logging.info("%s\n" % (failed_nodes))

            if not watch_cluster_operators_status:
                logging.info("Iteration %s: Failed operators" % (iteration))
                logging.info("%s\n" % (failed_operators))

            if not server_status:
                logging.info("Api Server is not healthy as reported by %s" %
                             (api_server_url))

            if not watch_namespaces_status:
                logging.info("Iteration %s: Failed pods and components" %
                             (iteration))
                for namespace, failures in failed_pods_components.items():
                    logging.info("%s: %s", namespace, failures)
                    for pod, containers in failed_pod_containers[
                            namespace].items():
                        logging.info("Failed containers in %s: %s", pod,
                                     containers)
                logging.info("")

            # Logging the failed checking of routes
            if failed_routes:
                logging.info("Iteration %s: Failed route monitoring" %
                             iteration)
                for route in failed_routes:
                    logging.info("Route url: %s" % route)
                logging.info("")

            # Report failures in a slack channel
            if not watch_nodes_status or not watch_namespaces_status or \
                    not watch_cluster_operators_status:
                if slack_integration:
                    slackcli.slack_logging(cluster_info, iteration,
                                           watch_nodes_status, failed_nodes,
                                           watch_cluster_operators_status,
                                           failed_operators,
                                           watch_namespaces_status,
                                           failed_pods_components)

            # Run inspection only when the distribution is openshift
            if distribution == "openshift" and inspect_components:
                inspect.inspect_components(failed_pods_components)
            elif distribution == "kubernetes" and inspect_components:
                logging.info("Skipping the failed components inspection as "
                             "it's specific to OpenShift")

            # Aggregate the status and publish it
            cerberus_status = watch_nodes_status and watch_namespaces_status \
                and watch_cluster_operators_status and server_status

            if cerberus_publish_status:
                publish_cerberus_status(cerberus_status)

            # Alert on high latencies
            # Initialize prometheus client
            if distribution == "openshift" and not prometheus_url:
                url = runcommand.invoke(
                    r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'"""
                )  # noqa
                prometheus_url = "https://" + url
            if distribution == "openshift" and not prometheus_bearer_token:
                prometheus_bearer_token = runcommand.invoke(
                    "oc -n openshift-monitoring sa get-token prometheus-k8s"
                )  # noqa
            if prometheus_url and prometheus_bearer_token:
                promcli.initialize_prom_client(prometheus_url,
                                               prometheus_bearer_token)
                # Check for high latency alerts
                query = r"""ALERTS{alertname="KubeAPILatencyHigh", severity="warning"}"""
                metrics = promcli.get_metrics(query)
                if metrics:
                    logging.warning(
                        "Kubernetes API server latency is high. "
                        "More than 99th percentile latency for given requests to the kube-apiserver is above 1 second.\n"
                    )  # noqa
                    logging.info("%s\n" % (metrics))
            else:
                logging.info(
                    "Skipping the alerts check as the prometheus url and bearer token are not provided\n"
                )  # noqa

            # Sleep for the specified duration
            logging.info("Sleeping for the specified duration: %s\n" %
                         (sleep_time))
            time.sleep(float(sleep_time))

            crashed_restarted_pods = defaultdict(list)

            for namespace in watch_namespaces:
                crashed_restarted_pods.update(
                    kubecli.namespace_sleep_tracker(namespace))

            if crashed_restarted_pods:
                logging.info(
                    "Pods that were crashed/restarted during the sleep interval of "
                    "iteration %s" % (iteration))
                for namespace, pods in crashed_restarted_pods.items():
                    logging.info("%s: %s" % (namespace, pods))
                logging.info("")

            # Capture total time taken by the iteration
            iter_track_time['entire_iteration'] = (
                time.time() - iteration_start_time) - sleep_time  # noqa

            # Print the captured timing for each operation
            logging.info(
                "-------------------------- Iteration Stats ---------------------------"
            )
            for operation, timing in iter_track_time.items():
                logging.info(
                    "Time taken to run %s in iteration %s: %s seconds" %
                    (operation, iteration, timing))
            logging.info(
                "----------------------------------------------------------------------\n"
            )

        else:
            logging.info(
                "Completed watching for the specified number of iterations: %s"
                % (iterations))
    else:
        logging.error("Could not find a config at %s, please check" % (cfg))
        sys.exit(1)
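
This main(cfg) is usually driven by a small CLI wrapper. A sketch of such an entry point; the option name and default config path are assumptions for illustration, not the project's actual CLI.

import argparse
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", default="config/config.yaml",
                        help="path to the cerberus config file")
    args = parser.parse_args()
    main(args.config)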
Example #12
def get_all_pod_info(namespace):
    all_pod_info = runcommand.invoke("kubectl get pods -n " + namespace +
                                     " -o json")
    all_pod_info = json.loads(all_pod_info)
    return all_pod_info
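
The return value is the parsed kubectl JSON, so callers typically walk items and read each pod's metadata and status. A small consumption sketch, assuming the function above; the namespace is just an example.

all_pod_info = get_all_pod_info("openshift-etcd")
for pod_info in all_pod_info["items"]:
    name = pod_info["metadata"]["name"]
    phase = pod_info["status"]["phase"]
    print(name, phase)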
Example #13
def get_cluster_operators():
    operators_status = runcommand.invoke("kubectl get co -o yaml")
    status_yaml = yaml.load(operators_status, Loader=yaml.FullLoader)
    return status_yaml
Example #14
def create_db():
    if os.path.isfile(db_path):
        runcommand.invoke("rm " + db_path)
    sqlite3.connect(db_path)
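
create_db only creates an empty SQLite file; the companion create_table and insert used elsewhere (see the dbcli.insert calls in Example #20) are not shown. The sketch below infers a plausible schema from those call sites and is an assumption, not the project's actual schema.

import sqlite3


def create_table():
    # Columns mirror the arguments passed to dbcli.insert() in Example #20:
    # date, timestamp, count, issue, failed object names, component.
    with sqlite3.connect(db_path) as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS failures "
                     "(date TEXT, timestamp REAL, count INTEGER, issue TEXT, "
                     "names TEXT, component TEXT)")


def insert(date, timestamp, count, issue, names, component):
    with sqlite3.connect(db_path) as conn:
        conn.execute("INSERT INTO failures VALUES (?, ?, ?, ?, ?, ?)",
                     (str(date), timestamp, count, issue, ", ".join(names), component))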
Example #15
def get_all_pod_info(namespace):
    all_pod_info = runcommand.invoke(
        "kubectl get pods --chunk-size " + request_chunk_size + " -n " + namespace + " -o json", cmd_timeout
    )
    all_pod_info = json.loads(all_pod_info)
    return all_pod_info
Example #16
def get_all_nodes_info():
    nodes_info = runcommand.invoke("kubectl get nodes --chunk-size " + request_chunk_size + " -o json", cmd_timeout)
    nodes_info = json.loads(nodes_info)
    return nodes_info
Example #17
def get_csrs():
    csr_string = runcommand.invoke("oc get csr -o yaml", cmd_timeout)
    csr_yaml = yaml.load(csr_string, Loader=yaml.FullLoader)
    return csr_yaml
Example #18
def main(cfg):
    # Start cerberus
    print(pyfiglet.figlet_format("cerberus"))
    logging.info("Starting ceberus")

    # Parse and read the config
    if os.path.isfile(cfg):
        with open(cfg, 'r') as f:
            config = yaml.full_load(f)
        watch_nodes = config["cerberus"]["watch_nodes"]
        cerberus_publish_status = \
            config["cerberus"]["cerberus_publish_status"]
        watch_namespaces = config["cerberus"]["watch_namespaces"]
        kubeconfig_path = config["cerberus"]["kubeconfig_path"]
        inspect_components = config["cerberus"]["inspect_components"]
        slack_integration = config["cerberus"]["slack_integration"]
        iterations = config["tunings"]["iterations"]
        sleep_time = config["tunings"]["sleep_time"]
        daemon_mode = config["tunings"]["daemon_mode"]

        # Initialize clients
        if not os.path.isfile(kubeconfig_path):
            kubeconfig_path = None
        logging.info("Initializing client to talk to the Kubernetes cluster")
        kubecli.initialize_clients(kubeconfig_path)
        if "openshift-sdn" in watch_namespaces:
            sdn_namespace = kubecli.check_sdn_namespace()
            watch_namespaces = [
                w.replace('openshift-sdn', sdn_namespace)
                for w in watch_namespaces
            ]

        # Cluster info
        logging.info("Fetching cluster info")
        cluster_version = runcommand.invoke("kubectl get clusterversion")
        cluster_info = runcommand.invoke(
            "kubectl cluster-info | awk 'NR==1' | sed -r "
            "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'")  # noqa
        logging.info("\n%s%s" % (cluster_version, cluster_info))

        # Run http server using a separate thread
        # if cerberus is asked to publish the status.
        # It is served by the http server.
        if cerberus_publish_status:
            address = ("0.0.0.0", 8080)
            server_address = address[0]
            port = address[1]
            logging.info("Publishing cerberus status at http://%s:%s" %
                         (server_address, port))
            server.start_server(address)

        # Create slack WebClient when slack integration has been enabled
        if slack_integration:
            try:
                slackcli.initialize_slack_client()
            except Exception as e:
                slack_integration = False
                logging.error(
                    "Couldn't create slack WebClient. Check if slack env "
                    "varaibles are set. Exception: %s" % (e))
                logging.info("Slack integration has been disabled.")

        # Remove 'inspect_data' directory if it exists.
        # 'inspect_data' directory is used to collect
        # logs, events and metrics of the failed component
        if os.path.isdir("inspect_data/"):
            logging.info("Deleting existing inspect_data directory")
            runcommand.invoke("rm -R inspect_data")

        # Initialize the start iteration to 0
        iteration = 0

        # Set the number of iterations to loop to infinity
        # if daemon mode is enabled
        # or else set it to the provided iterations count in the config
        if daemon_mode:
            logging.info("Daemon mode enabled, cerberus will monitor forever")
            logging.info("Ignoring the iterations set")
            iterations = float('inf')
        else:
            iterations = int(iterations)

        # Loop to run the components status checks starts here
        while (int(iteration) < iterations):
            iteration += 1
            print("\n")

            if slack_integration:
                weekday = runcommand.invoke("date '+%A'")[:-1]
                cop_slack_member_ID = config["cerberus"]["cop_slack_ID"][
                    weekday]
                valid_cops = slackcli.get_channel_members()['members']
                slack_team_alias = config["cerberus"]["slack_team_alias"]

                if cop_slack_member_ID in valid_cops:
                    slack_tag = "<@" + cop_slack_member_ID + ">"
                elif slack_team_alias:
                    slack_tag = "@" + slack_team_alias + " "
                else:
                    slack_tag = ""

                if iteration == 1:
                    if cop_slack_member_ID in valid_cops:
                        slack_tag = "Hi " + slack_tag + "! The cop " \
                                    "for " + weekday + "!\n"
                    slackcli.post_message_in_slack(
                        slack_tag + "Cerberus has started monitoring! "
                        ":skull_and_crossbones: %s" % (cluster_info))

            # Monitor nodes status
            if watch_nodes:
                watch_nodes_status, failed_nodes = kubecli.monitor_nodes()
                logging.info("Iteration %s: Node status: %s" %
                             (iteration, watch_nodes_status))
            else:
                logging.info("Cerberus is not monitoring nodes, "
                             "so setting the status to True and "
                             "assuming that the nodes are ready")
                watch_nodes_status = True

            # Monitor each component in the namespace
            # Set the initial cerberus_status
            failed_pods_components = {}
            cerberus_status = True

            for namespace in watch_namespaces:
                watch_component_status, failed_component_pods = \
                    kubecli.monitor_component(iteration, namespace)
                cerberus_status = cerberus_status and watch_component_status
                if not watch_component_status:
                    failed_pods_components[namespace] = failed_component_pods

            # Check for the number of hits
            if cerberus_publish_status:
                logging.info("HTTP requests served: %s \n" %
                             (server.SimpleHTTPRequestHandler.requests_served))

            # Logging the failed components
            if not watch_nodes_status:
                logging.info("Failed nodes")
                logging.info("%s" % (failed_nodes))

            if not cerberus_status:
                logging.info("Failed pods and components")
                for namespace, failures in failed_pods_components.items():
                    logging.info("%s: %s \n", namespace, failures)
                if slack_integration:
                    failed_namespaces = ", ".join(
                        list(failed_pods_components.keys()))
                    valid_cops = slackcli.get_channel_members()['members']
                    cerberus_report_path = runcommand.invoke(
                        "pwd | tr -d '\n'")
                    slackcli.post_message_in_slack(
                        slack_tag + " %sIn iteration %d, cerberus "
                        "found issues in namespaces: *%s*. Hence, "
                        "setting the go/no-go signal to false. The "
                        "full report is at *%s* on the host cerberus "
                        "is running." %
                        (cluster_info, iteration, failed_namespaces,
                         cerberus_report_path))

            if inspect_components:
                for namespace in failed_pods_components.keys():
                    dir_name = "inspect_data/" + namespace + "-logs"
                    if os.path.isdir(dir_name):
                        runcommand.invoke("rm -R " + dir_name)
                        logging.info("Deleted existing %s directory" %
                                     (dir_name))
                    command_out = runcommand.invoke("oc adm inspect ns/" +
                                                    namespace + " --dest"
                                                    "-dir=" + dir_name)
                    logging.info(command_out)

            if cerberus_publish_status:
                publish_cerberus_status(cerberus_status)

            # Sleep for the specified duration
            logging.info("Sleeping for the "
                         "specified duration: %s" % (sleep_time))
            time.sleep(float(sleep_time))

        else:
            logging.info(
                "Completed watching for the specified number of iterations: %s"
                % (iterations))
    else:
        logging.error("Could not find a config at %s, please check" % (cfg))
        sys.exit(1)
Example #19
def delete_inspect_directory():
    if os.path.isdir("inspect_data/"):
        logging.info("Deleting existing inspect_data directory")
        runcommand.invoke("rm -R inspect_data")
Example #20
def main(cfg):
    # Start cerberus
    print(pyfiglet.figlet_format("cerberus"))
    logging.info("Starting ceberus")

    # Parse and read the config
    if os.path.isfile(cfg):
        with open(cfg, "r") as f:
            config = yaml.full_load(f)
        distribution = config["cerberus"].get("distribution",
                                              "openshift").lower()
        kubeconfig_path = config["cerberus"].get("kubeconfig_path", "")
        port = config["cerberus"].get("port", 8080)
        watch_nodes = config["cerberus"].get("watch_nodes", False)
        watch_cluster_operators = config["cerberus"].get(
            "watch_cluster_operators", False)
        watch_namespaces = config["cerberus"].get("watch_namespaces", [])
        watch_url_routes = config["cerberus"].get("watch_url_routes", [])
        watch_master_schedulable = config["cerberus"].get(
            "watch_master_schedulable", {})
        cerberus_publish_status = config["cerberus"].get(
            "cerberus_publish_status", False)
        inspect_components = config["cerberus"].get("inspect_components",
                                                    False)
        slack_integration = config["cerberus"].get("slack_integration", False)
        prometheus_url = config["cerberus"].get("prometheus_url", "")
        prometheus_bearer_token = config["cerberus"].get(
            "prometheus_bearer_token", "")
        custom_checks = config["cerberus"].get("custom_checks", [])
        iterations = config["tunings"].get("iterations", 0)
        sleep_time = config["tunings"].get("sleep_time", 0)
        cmd_timeout = config["tunings"].get("timeout", 60)
        request_chunk_size = config["tunings"].get(
            "kube_api_request_chunk_size", 250)
        daemon_mode = config["tunings"].get("daemon_mode", False)
        cores_usage_percentage = config["tunings"].get(
            "cores_usage_percentage", 0.5)
        database_path = config["database"].get("database_path",
                                               "/tmp/cerberus.db")
        reuse_database = config["database"].get("reuse_database", False)
        # Initialize custom checks vars
        custom_checks_status = True
        custom_checks_fail_messages = []

        # Initialize clients and set kube api request chunk size
        if not os.path.isfile(kubeconfig_path):
            kubeconfig_path = None
        logging.info("Initializing client to talk to the Kubernetes cluster")
        kubecli.initialize_clients(kubeconfig_path, request_chunk_size,
                                   cmd_timeout)

        if "openshift-sdn" in watch_namespaces:
            sdn_namespace = kubecli.check_sdn_namespace()
            watch_namespaces = [
                namespace.replace("openshift-sdn", sdn_namespace)
                for namespace in watch_namespaces
            ]

        # Check if all the namespaces under watch_namespaces are valid
        watch_namespaces = kubecli.check_namespaces(watch_namespaces)

        # Cluster info
        logging.info("Fetching cluster info")
        if distribution == "openshift":
            oc_version = runcommand.optional_invoke("oc version")
            logging.info("oc version:\n%s" % oc_version)

            cluster_version = runcommand.optional_invoke(
                "oc get clusterversion")
            logging.info("oc get clusterversion:\n%s" % cluster_version)

        cluster_info = runcommand.invoke(
            "kubectl cluster-info | awk 'NR==1' | sed -r "
            "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'")  # noqa
        logging.info("%s" % (cluster_info))

        # Run http server using a separate thread if cerberus is asked
        # to publish the status. It is served by the http server.
        if cerberus_publish_status:
            if not 0 <= port <= 65535:
                logging.info(
                    "Using port 8080 as %s isn't a valid port number" % (port))
                port = 8080
            address = ("0.0.0.0", port)
            server_address = address[0]
            port = address[1]
            logging.info("Publishing cerberus status at http://%s:%s" %
                         (server_address, port))
            server.start_server(address)

        dbcli.set_db_path(database_path)
        if not os.path.isfile(database_path) or not reuse_database:
            dbcli.create_db()
            dbcli.create_table()

        # Create slack WebClient when slack integration has been enabled
        if slack_integration:
            slack_integration = slackcli.initialize_slack_client()

        # Run inspection only when the distribution is openshift
        if distribution == "openshift" and inspect_components:
            logging.info(
                "Detailed inspection of failed components has been enabled")
            inspect.delete_inspect_directory()

        # get list of all master nodes with provided labels in the config
        master_nodes = []
        master_label = ""
        if watch_master_schedulable["enabled"]:
            master_label = watch_master_schedulable["label"]
            nodes = kubecli.list_nodes(master_label)
            if len(nodes) == 0:
                logging.error(
                    "No master node found for the label %s. Please check master node config."
                    % (master_label))  # noqa
                sys.exit(1)
            else:
                master_nodes.extend(nodes)

        # Use cluster_info to get the api server url
        api_server_url = cluster_info.split(" ")[-1].strip() + "/healthz"

        # Counter for if api server is not ok
        api_fail_count = 0

        # Variables used for multiprocessing
        global pool
        pool = multiprocessing.Pool(
            int(cores_usage_percentage * multiprocessing.cpu_count()),
            init_worker)
        manager = multiprocessing.Manager()

        # Track time taken for different checks in each iteration
        global time_tracker
        time_tracker = {}

        # Initialize the start iteration to 0
        iteration = 0

        # Initialize the prometheus client
        promcli.initialize_prom_client(distribution, prometheus_url,
                                       prometheus_bearer_token)

        # Prometheus query to alert on high apiserver latencies
        apiserver_latency_query = r"""ALERTS{alertname="KubeAPILatencyHigh", severity="warning"}"""
        # Prometheus query to alert on a high number of etcd leader changes
        etcd_leader_changes_query = r"""ALERTS{alertname="etcdHighNumberOfLeaderChanges", severity="warning"}"""  # noqa

        # Set the number of iterations to loop to infinity if daemon mode is
        # enabled or else set it to the provided iterations count in the config
        if daemon_mode:
            logging.info("Daemon mode enabled, cerberus will monitor forever")
            logging.info("Ignoring the iterations set\n")
            iterations = float("inf")
        else:
            iterations = int(iterations)

        # Loop to run the components status checks starts here
        while int(iteration) < iterations:
            try:
                # Initialize a dict to store the operations timings per iteration
                iter_track_time = manager.dict()

                # Capture the start time
                iteration_start_time = time.time()

                iteration += 1

                # Read the config for info when slack integration is enabled
                if slack_integration:
                    weekday = runcommand.invoke("date '+%A'")[:-1]
                    watcher_slack_member_ID = config["cerberus"][
                        "watcher_slack_ID"].get(weekday, None)
                    slack_team_alias = config["cerberus"].get(
                        "slack_team_alias", None)
                    slackcli.slack_tagging(watcher_slack_member_ID,
                                           slack_team_alias)

                    if iteration == 1:
                        slackcli.slack_report_cerberus_start(
                            cluster_info, weekday, watcher_slack_member_ID)

                # Collect the initial creation_timestamp and restart_count of all the pods in all
                # the namespaces in watch_namespaces
                if iteration == 1:
                    pods_tracker = manager.dict()
                    pool.starmap(kubecli.namespace_sleep_tracker,
                                 zip(watch_namespaces, repeat(pods_tracker)))

                # Execute the functions to check api_server_status, master_schedulable_status,
                # watch_nodes, watch_cluster_operators parallely
                (
                    (server_status),
                    (schedulable_masters),
                    (watch_nodes_status, failed_nodes),
                    (watch_cluster_operators_status, failed_operators),
                    (failed_routes),
                ) = pool.map(
                    smap,
                    [
                        functools.partial(kubecli.is_url_available,
                                          api_server_url),
                        functools.partial(kubecli.process_master_taint,
                                          master_nodes, master_label,
                                          iteration, iter_track_time),
                        functools.partial(kubecli.process_nodes, watch_nodes,
                                          iteration, iter_track_time),
                        functools.partial(
                            kubecli.process_cluster_operator,
                            distribution,
                            watch_cluster_operators,
                            iteration,
                            iter_track_time,
                        ),
                        functools.partial(kubecli.process_routes,
                                          watch_url_routes, iter_track_time),
                    ],
                )

                # Increment api_fail_count if api server url is not ok
                if not server_status:
                    api_fail_count += 1

                # Initialize a shared_memory of type dict to share data between different processes
                failed_pods_components = manager.dict()
                failed_pod_containers = manager.dict()

                # Monitor all the namespaces parallely
                watch_namespaces_start_time = time.time()
                pool.starmap(
                    kubecli.process_namespace,
                    zip(
                        repeat(iteration),
                        watch_namespaces,
                        repeat(failed_pods_components),
                        repeat(failed_pod_containers),
                    ),
                )

                watch_namespaces_status = False if failed_pods_components else True
                iter_track_time["watch_namespaces"] = time.time(
                ) - watch_namespaces_start_time

                # Check for the number of hits
                if cerberus_publish_status:
                    logging.info(
                        "HTTP requests served: %s \n" %
                        (server.SimpleHTTPRequestHandler.requests_served))

                if schedulable_masters:
                    logging.warning(
                        "Iteration %s: Masters without NoSchedule taint: %s\n"
                        % (iteration, schedulable_masters))

                # Logging the failed components
                if not watch_nodes_status:
                    logging.info("Iteration %s: Failed nodes" % (iteration))
                    logging.info("%s\n" % (failed_nodes))
                    dbcli.insert(datetime.now(), time.time(), 1, "not ready",
                                 failed_nodes, "node")

                if not watch_cluster_operators_status:
                    logging.info("Iteration %s: Failed operators" %
                                 (iteration))
                    logging.info("%s\n" % (failed_operators))
                    dbcli.insert(datetime.now(), time.time(), 1, "degraded",
                                 failed_operators, "cluster operator")

                if not server_status:
                    logging.info(
                        "Iteration %s: Api Server is not healthy as reported by %s\n"
                        % (iteration, api_server_url))
                    dbcli.insert(datetime.now(), time.time(), 1, "unavailable",
                                 list(api_server_url), "api server")

                if not watch_namespaces_status:
                    logging.info("Iteration %s: Failed pods and components" %
                                 (iteration))
                    for namespace, failures in failed_pods_components.items():
                        logging.info("%s: %s", namespace, failures)

                        for pod, containers in failed_pod_containers[
                                namespace].items():
                            logging.info("Failed containers in %s: %s", pod,
                                         containers)

                        component = namespace.split("-")
                        if component[0] == "openshift":
                            component = "-".join(component[1:])
                        else:
                            component = "-".join(component)
                        dbcli.insert(datetime.now(), time.time(), 1,
                                     "pod crash", failures, component)
                    logging.info("")

                # Logging the failed checking of routes
                watch_routes_status = True
                if failed_routes:
                    watch_routes_status = False
                    logging.info("Iteration %s: Failed route monitoring" %
                                 iteration)
                    for route in failed_routes:
                        logging.info("Route url: %s" % route)
                    logging.info("")
                    dbcli.insert(datetime.now(), time.time(), 1, "unavailable",
                                 failed_routes, "route")

                # Aggregate the status and publish it
                cerberus_status = (watch_nodes_status
                                   and watch_namespaces_status
                                   and watch_cluster_operators_status
                                   and server_status and watch_routes_status)

                if distribution == "openshift":
                    watch_csrs_start_time = time.time()
                    csrs = kubecli.get_csrs()
                    pending_csr = []
                    for csr in csrs["items"]:
                        # find csr status
                        if "conditions" in csr["status"]:
                            if "Approved" not in csr["status"]["conditions"][
                                    0]["type"]:
                                pending_csr.append(csr["metadata"]["name"])
                        else:
                            pending_csr.append(csr["metadata"]["name"])
                    if pending_csr:
                        logging.warning(
                            "There are CSR's that are currently not approved")
                        logging.warning("Csr's that are not approved: " +
                                        str(pending_csr))
                    iter_track_time["watch_csrs"] = time.time(
                    ) - watch_csrs_start_time

                if custom_checks:
                    if iteration == 1:
                        custom_checks_imports = []
                        for check in custom_checks:
                            my_check = ".".join(
                                check.replace("/", ".").split(".")[:-1])
                            my_check_module = importlib.import_module(my_check)
                            custom_checks_imports.append(my_check_module)
                    custom_checks_fail_messages = []
                    custom_checks_status = True
                    for check in custom_checks_imports:
                        check_returns = check.main()
                        if type(check_returns) == bool:
                            custom_checks_status = custom_checks_status and check_returns
                        elif type(check_returns) == dict:
                            status = check_returns["status"]
                            message = check_returns["message"]
                            custom_checks_status = custom_checks_status and status
                            custom_checks_fail_messages.append(message)
                    cerberus_status = cerberus_status and custom_checks_status

                if cerberus_publish_status:
                    publish_cerberus_status(cerberus_status)

                # Report failures in a slack channel
                if (not watch_nodes_status or not watch_namespaces_status
                        or not watch_cluster_operators_status
                        or not custom_checks_status):
                    if slack_integration:
                        slackcli.slack_logging(
                            cluster_info,
                            iteration,
                            watch_nodes_status,
                            failed_nodes,
                            watch_cluster_operators_status,
                            failed_operators,
                            watch_namespaces_status,
                            failed_pods_components,
                            custom_checks_status,
                            custom_checks_fail_messages,
                        )

                # Run inspection only when the distribution is openshift
                if distribution == "openshift" and inspect_components:
                    # Collect detailed logs for all the namespaces with failed
                    # components parallely
                    pool.map(inspect.inspect_component,
                             failed_pods_components.keys())
                    logging.info("")
                elif distribution == "kubernetes" and inspect_components:
                    logging.info(
                        "Skipping the failed components inspection as "
                        "it's specific to OpenShift")

                # Alert on high latencies
                metrics = promcli.process_prom_query(apiserver_latency_query)
                if metrics:
                    logging.warning(
                        "Kubernetes API server latency is high. "
                        "More than 99th percentile latency for given requests to the "
                        "kube-apiserver is above 1 second.\n")
                    logging.info("%s\n" % (metrics))

                # Alert on a high number of etcd leader changes
                metrics = promcli.process_prom_query(etcd_leader_changes_query)
                if metrics:
                    logging.warning(
                        "Observed increase in number of etcd leader elections over the last "
                        "15 minutes. Frequent elections may be a sign of insufficient resources, "
                        "high network latency, or disruptions by other components and should be "
                        "investigated.\n")
                logging.info("%s\n" % (metrics))

                # Sleep for the specified duration
                logging.info("Sleeping for the specified duration: %s\n" %
                             (sleep_time))
                time.sleep(float(sleep_time))

                sleep_tracker_start_time = time.time()

                # Track pod crashes/restarts during the sleep interval in all namespaces parallely
                multiprocessed_output = pool.starmap(
                    kubecli.namespace_sleep_tracker,
                    zip(watch_namespaces, repeat(pods_tracker)))

                crashed_restarted_pods = {}
                for item in multiprocessed_output:
                    crashed_restarted_pods.update(item)

                iter_track_time["sleep_tracker"] = time.time(
                ) - sleep_tracker_start_time

                if crashed_restarted_pods:
                    logging.info(
                        "Pods that were crashed/restarted during the sleep interval of "
                        "iteration %s" % (iteration))
                    for namespace, pods in crashed_restarted_pods.items():
                        distinct_pods = set(pod[0] for pod in pods)
                        logging.info("%s: %s" % (namespace, distinct_pods))
                        component = namespace.split("-")
                        if component[0] == "openshift":
                            component = "-".join(component[1:])
                        else:
                            component = "-".join(component)
                        for pod in pods:
                            if pod[1] == "crash":
                                dbcli.insert(datetime.now(), time.time(), 1,
                                             "pod crash", [pod[0]], component)
                            elif pod[1] == "restart":
                                dbcli.insert(datetime.now(), time.time(),
                                             pod[2], "pod restart", [pod[0]],
                                             component)
                    logging.info("")

                # Capture total time taken by the iteration
                iter_track_time["entire_iteration"] = (
                    time.time() - iteration_start_time) - sleep_time  # noqa

                time_tracker["Iteration " +
                             str(iteration)] = iter_track_time.copy()

                # Print the captured timing for each operation
                logging.info(
                    "-------------------------- Iteration Stats ---------------------------"
                )  # noqa
                for operation, timing in iter_track_time.items():
                    logging.info(
                        "Time taken to run %s in iteration %s: %s seconds" %
                        (operation, iteration, timing))
                logging.info(
                    "----------------------------------------------------------------------\n"
                )  # noqa

            except KeyboardInterrupt:
                pool.terminate()
                pool.join()
                logging.info("Terminating cerberus monitoring")
                record_time(time_tracker)
                sys.exit(1)

            except Exception as e:
                logging.info(
                    "Encountered issues in cluster. Hence, setting the go/no-go "
                    "signal to false")
                logging.info("Exception: %s\n" % (e))
                if cerberus_publish_status:
                    publish_cerberus_status(False)
                continue

        else:
            logging.info(
                "Completed watching for the specified number of iterations: %s"
                % (iterations))
            record_time(time_tracker)
            pool.close()
            pool.join()
    else:
        logging.error("Could not find a config at %s, please check" % (cfg))
        sys.exit(1)
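
Example #20 references two small multiprocessing helpers, smap and init_worker, that are not shown. The sketch below assumes the usual shape of such helpers: smap calls a zero-argument functools.partial inside a pool worker, and init_worker makes workers ignore SIGINT so the parent process can handle Ctrl+C. These definitions are assumptions, not the project's code.

import signal


def smap(f):
    # Call a zero-argument callable (e.g. a functools.partial) inside a pool worker.
    return f()


def init_worker():
    # Workers ignore SIGINT so KeyboardInterrupt is handled only by the parent process.
    signal.signal(signal.SIGINT, signal.SIG_IGN)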