def test_send_replication_event_if_under_replication_critical(instance_config):
    """89 of 100 expected instances available must be sent with status=2.

    Also verifies that the alert output contains the availability summary
    line and the ``paasta status`` command for the instance.
    """
    patcher = mock.patch(
        "paasta_tools.monitoring_tools.send_replication_event", autospec=True
    )
    with patcher as mock_send_event:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config, expected_count=100, num_available=89
        )

        # Exactly one event, flagged with status 2; the text is checked below.
        mock_send_event.assert_called_once_with(
            instance_config=instance_config, status=2, output=mock.ANY
        )

        # call_args is an (args, kwargs) pair; the output is passed by keyword.
        alert_output = mock_send_event.call_args[1]["output"]

        summary_template = (
            "{} has 89 out of 100 expected instances available!\n(threshold: 90%)"
        )
        expected_summary = summary_template.format(instance_config.job_id)
        assert expected_summary in alert_output

        expected_command = "paasta status -s {} -i {} -c {} -vv".format(
            instance_config.service,
            instance_config.instance,
            instance_config.cluster,
        )
        assert expected_command in alert_output
def test_send_replication_event_if_under_replication_handles_0_expected(
    instance_config,
):
    """Zero expected and zero available instances must be sent with status=0.

    The call is made with ``dry_run=True`` and the test checks that the flag
    is forwarded to ``send_replication_event`` along with a description.
    """
    patcher = mock.patch(
        "paasta_tools.monitoring_tools.send_replication_event", autospec=True
    )
    with patcher as mock_send_event:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=0,
            num_available=0,
            dry_run=True,
        )

        # One event only, status 0, with dry_run passed through unchanged.
        mock_send_event.assert_called_once_with(
            instance_config=instance_config,
            status=0,
            output=mock.ANY,
            description=mock.ANY,
            dry_run=True,
        )

        # call_args is an (args, kwargs) pair; the output is passed by keyword.
        alert_output = mock_send_event.call_args[1]["output"]

        summary_template = "{} has 0/0 replicas available (threshold: 90%)"
        expected_summary = summary_template.format(instance_config.job_id)
        assert expected_summary in alert_output
def check_healthy_marathon_tasks_for_service_instance(instance_config, expected_count, all_tasks):
    """Send a replication event for an instance based on its marathon tasks.

    The short app id is built from the instance's service and instance names,
    the healthy-task figure for that app id is obtained from ``all_tasks``,
    and that figure is forwarded as ``num_available`` together with
    ``expected_count``.

    :param instance_config: config object exposing ``service`` and ``instance``
    :param expected_count: number of instances the caller expects
    :param all_tasks: tasks handed to the healthy-instance filter
    """
    short_app_id = format_job_id(instance_config.service, instance_config.instance)
    healthy_count = filter_healthy_marathon_instances_for_short_app_id(
        all_tasks=all_tasks,
        app_id=short_app_id,
    )

    log.info("Checking %s in marathon as it is not in smartstack" % short_app_id)

    event_kwargs = dict(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=healthy_count,
    )
    monitoring_tools.send_replication_event_if_under_replication(**event_kwargs)
def test_send_replication_event_if_under_replication_good(instance_config):
    """100 of 100 expected instances available must be sent with status=0.

    Also verifies that the alert output contains the availability summary.
    """
    patcher = mock.patch(
        "paasta_tools.monitoring_tools.send_replication_event", autospec=True
    )
    with patcher as mock_send_event:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=100,
            num_available=100,
        )

        # Exactly one event, flagged with status 0; the text is checked below.
        mock_send_event.assert_called_once_with(
            instance_config=instance_config,
            status=0,
            output=mock.ANY,
        )

        # call_args is an (args, kwargs) pair; the output is passed by keyword.
        alert_output = mock_send_event.call_args[1]["output"]

        summary_template = (
            "{} has 100 out of 100 expected instances available!\n(threshold: 90%)"
        )
        expected_summary = summary_template.format(instance_config.job_id)
        assert expected_summary in alert_output
def check_healthy_kubernetes_tasks_for_service_instance(
    instance_config: KubernetesDeploymentConfig,
    expected_count: int,
    all_pods: Sequence[V1Pod],
) -> None:
    """Send a replication event for an instance based on its ready pods.

    Pods are narrowed to the ones belonging to this service instance, those
    for which ``is_pod_ready`` is truthy are counted, and the count is
    forwarded as ``num_available`` together with ``expected_count``.

    :param instance_config: deployment config exposing ``service`` and ``instance``
    :param expected_count: number of instances the caller expects
    :param all_pods: pods to be filtered down to this service instance
    """
    matching_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    ready_count = sum(1 for candidate in matching_pods if is_pod_ready(candidate))

    log.info(
        f"Checking {instance_config.service}.{instance_config.instance} in kubernetes as it is not in smartstack"
    )

    event_kwargs = dict(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=ready_count,
    )
    monitoring_tools.send_replication_event_if_under_replication(**event_kwargs)
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Send health events for a Flink instance and its three sub-components.

    Steps, in order:

    1. Read the expected taskmanager count from the ``taskmanager.instances``
       entry of the instance config (10 when absent).
    2. Count healthy supervisor, jobmanager and taskmanager containers among
       the pods of this service instance.
    3. Ask the jobmanager overview for its ``taskmanagers`` figure; a
       ``ValueError`` is captured as text instead of being raised.
    4. Send the "not enough taskmanagers" event, then one replication event
       per sub-component (supervisor and jobmanager expect exactly 1).

    ``smartstack_replication_checker`` is accepted but not used in this body.
    """
    instance_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    taskmanager_section = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_section.get("instances", 10)

    # Expected count per sub-component, in the order the events are sent.
    expected_by_component = (
        ("supervisor", 1),
        ("jobmanager", 1),
        ("taskmanager", expected_taskmanagers),
    )

    # All healthy counts are gathered up front, before anything is sent.
    healthy_by_component = {
        component: healthy_flink_containers_cnt(instance_pods, component)
        for component, _ in expected_by_component
    }

    error_text = None
    taskmanagers_reported = None
    try:
        jobmanager_overview = flink_tools.get_flink_jobmanager_overview(
            instance_config.service, instance_config.instance, instance_config.cluster
        )
        taskmanagers_reported = jobmanager_overview.get("taskmanagers", 0)
    except ValueError as err:
        # Keep going: the failure text is handed to the event below.
        error_text = str(err)

    send_event_if_not_enough_taskmanagers(
        instance_config=instance_config,
        expected_count=expected_taskmanagers,
        num_reported=taskmanagers_reported,
        strerror=error_text,
    )

    for component, expected in expected_by_component:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy_by_component[component],
            sub_component=component,
        )
def check_flink_service_replication(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Send one replication event per Flink sub-component of an instance.

    Healthy supervisor, jobmanager and taskmanager containers are counted
    among the pods of this service instance. Supervisor and jobmanager are
    checked against an expected count of 1; taskmanagers are checked against
    the ``taskmanager.instances`` entry of the instance config (10 when
    absent).

    ``smartstack_replication_checker`` is accepted but not used in this body.
    """
    instance_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    taskmanager_section = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_section.get("instances", 10)

    # Every count is taken before the first event goes out.
    healthy_supervisors = healthy_flink_containers_cnt(instance_pods, "supervisor")
    healthy_jobmanagers = healthy_flink_containers_cnt(instance_pods, "jobmanager")
    healthy_taskmanagers = healthy_flink_containers_cnt(instance_pods, "taskmanager")

    # TBD: check cnt according to Flink
    checks = (
        ("supervisor", 1, healthy_supervisors),
        ("jobmanager", 1, healthy_jobmanagers),
        ("taskmanager", expected_taskmanagers, healthy_taskmanagers),
    )
    for component, expected, available in checks:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=available,
            sub_component=component,
        )