def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_tasks_or_pods: Sequence[V1Pod],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> None:
    """Run the Flink health checks for one instance and send a single event.

    The pods belonging to ``instance_config``'s service/instance are selected
    from ``all_tasks_or_pods``. Four checks are run, in this order:
    under-replication of the supervisor, the jobmanager and the taskmanagers,
    followed by the registered-taskmanagers check. Their results are merged
    into one replication event.

    :param instance_config: configuration of the Flink instance being checked
    :param all_tasks_or_pods: pods to search for this instance's pods
    :param replication_checker: accepted but not used by this function
    :param dry_run: forwarded unchanged to ``send_replication_event``
    """
    instance_pods = filter_pods_by_service_instance(
        pod_list=all_tasks_or_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    # Fall back to 10 taskmanagers when the config does not specify a count.
    taskmanager_conf = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_conf.get("instances", 10)

    healthy_cnt = {
        component: healthy_flink_containers_cnt(instance_pods, component)
        for component in ("supervisor", "jobmanager", "taskmanager")
    }
    cr_name = get_cr_name(instance_pods)

    # (sub_component, expected_count) for every under-replication check.
    replication_targets = (
        ("supervisor", 1),
        ("jobmanager", 1),
        ("taskmanager", expected_taskmanagers),
    )
    results = []
    for component, expected in replication_targets:
        results.append(
            check_under_replication(
                instance_config=instance_config,
                expected_count=expected,
                num_available=healthy_cnt[component],
                sub_component=component,
            )
        )
    results.append(
        check_under_registered_taskmanagers(
            instance_config=instance_config,
            expected_count=expected_taskmanagers,
            cr_name=cr_name,
        )
    )

    # Each result is indexed as: [0] failure flag, [1] output, [2] description.
    output = ", ".join(result[1] for result in results)
    description = "\n########\n".join(result[2] for result in results)

    has_failure = any(result[0] for result in results)
    if not has_failure:
        log.info(output)
        status = pysensu_yelp.Status.OK
    else:
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Send monitoring events about the health of one Flink instance.

    First the jobmanager overview is fetched and the number of taskmanagers
    it reports is passed to ``send_event_if_not_enough_taskmanagers``. Then
    an under-replication event is sent for each of the supervisor, the
    jobmanager and the taskmanagers, based on the healthy containers found
    among this instance's pods.

    :param instance_config: configuration of the Flink instance being checked
    :param all_pods: pods to search for this instance's pods
    :param smartstack_replication_checker: accepted but not used by this
        function
    """
    instance_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    # Fall back to 10 taskmanagers when the config does not specify a count.
    taskmanager_conf = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_conf.get("instances", 10)

    healthy_supervisors = healthy_flink_containers_cnt(instance_pods, "supervisor")
    healthy_jobmanagers = healthy_flink_containers_cnt(instance_pods, "jobmanager")
    healthy_taskmanagers = healthy_flink_containers_cnt(instance_pods, "taskmanager")

    # On ValueError the reported count stays None and the error text is
    # handed to the taskmanager event instead.
    error_text, reported_count = None, None
    try:
        jobmanager_overview = flink_tools.get_flink_jobmanager_overview(
            instance_config.service,
            instance_config.instance,
            instance_config.cluster,
        )
        reported_count = jobmanager_overview.get("taskmanagers", 0)
    except ValueError as exc:
        error_text = str(exc)

    send_event_if_not_enough_taskmanagers(
        instance_config=instance_config,
        expected_count=expected_taskmanagers,
        num_reported=reported_count,
        strerror=error_text,
    )

    # (sub_component, expected_count, num_available) per replication event.
    replication_checks = (
        ("supervisor", 1, healthy_supervisors),
        ("jobmanager", 1, healthy_jobmanagers),
        ("taskmanager", expected_taskmanagers, healthy_taskmanagers),
    )
    for component, expected, available in replication_checks:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=available,
            sub_component=component,
        )
def check_healthy_kubernetes_tasks_for_service_instance(
    instance_config: KubernetesDeploymentConfig,
    expected_count: int,
    all_pods: Sequence[V1Pod],
) -> None:
    """Send a replication event based on how many of the instance's pods are ready.

    Pods belonging to ``instance_config``'s service/instance are selected
    from ``all_pods``; those for which ``is_pod_ready`` is true are counted
    and the count is reported against ``expected_count``.

    :param instance_config: configuration of the instance being checked
    :param expected_count: number of pods expected to be available
    :param all_pods: pods to search for this instance's pods
    """
    instance_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    ready_count = sum(1 for pod in instance_pods if is_pod_ready(pod))

    log.info(
        f"Checking {instance_config.service}.{instance_config.instance} in kubernetes as it is not in smartstack"
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=ready_count,
    )
def check_flink_service_replication(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Send under-replication events for each component of a Flink instance.

    The healthy supervisor, jobmanager and taskmanager containers are counted
    among this instance's pods, and one replication event call is made per
    component. One supervisor and one jobmanager are expected; the expected
    taskmanager count comes from the instance config.

    :param instance_config: configuration of the Flink instance being checked
    :param all_pods: pods to search for this instance's pods
    :param smartstack_replication_checker: accepted but not used by this
        function
    """
    instance_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    # Fall back to 10 taskmanagers when the config does not specify a count.
    taskmanager_conf = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_by_component = {
        "supervisor": 1,
        "jobmanager": 1,
        "taskmanager": taskmanager_conf.get("instances", 10),
    }

    # Count every component first, then emit the events in the same order.
    healthy_by_component = {
        component: healthy_flink_containers_cnt(instance_pods, component)
        for component in ("supervisor", "jobmanager", "taskmanager")
    }

    # TODO: also check the count according to Flink itself.
    for component, available in healthy_by_component.items():
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=expected_by_component[component],
            num_available=available,
            sub_component=component,
        )