def test_send_replication_event_users_monitoring_tools_send_event_respects_alert_after(
    instance_config,
):
    """Check that a configured ``alert_after`` is forwarded to ``send_event``.

    The instance's monitoring config sets ``alert_after`` to ``"666m"``; the
    overrides passed to ``monitoring_tools.send_event`` must carry that value
    together with the patched runbook and the ``check_every`` value.
    """
    fake_status = "999999"
    fake_output = "YOU DID IT"
    instance_config.get_monitoring.return_value = {"alert_after": "666m"}
    expected_check_name = (
        "check_paasta_services_replication.%s" % instance_config.job_id
    )
    with mock.patch(
        "paasta_tools.monitoring_tools.send_event", autospec=True
    ) as send_event_patch, mock.patch(
        "paasta_tools.monitoring_tools._log", autospec=True
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_runbook",
        autospec=True,
        return_value="y/runbook",
    ):
        monitoring_tools.send_replication_event(
            instance_config=instance_config, status=fake_status, output=fake_output
        )
        # Previously a bare comparison whose result was discarded; it has to
        # be an assert to actually verify anything.
        assert send_event_patch.call_count == 1
        send_event_patch.assert_called_once_with(
            service=instance_config.service,
            check_name=expected_check_name,
            overrides={
                "runbook": "y/runbook",
                "alert_after": "666m",
                "check_every": "1m",
            },
            status=fake_status,
            output=fake_output,
            soa_dir=instance_config.soa_dir,
            cluster=instance_config.cluster,
        )
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_tasks_or_pods: Sequence[V1Pod],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> None:
    """Evaluate the health of one Flink service instance and emit a replication event.

    The pods belonging to the instance are selected from ``all_tasks_or_pods``,
    the healthy supervisor / jobmanager / taskmanager containers are counted,
    and each count is checked against its expected value. A further check
    covers the taskmanagers registered for the instance's CR name. The event
    is CRITICAL when any check reports a failure, OK otherwise.

    :param instance_config: config of the Flink instance being checked
    :param all_tasks_or_pods: pods to filter down to this service instance
    :param replication_checker: not used by this function's body
    :param dry_run: forwarded unchanged to ``send_replication_event``
    """
    pods = filter_pods_by_service_instance(
        pod_list=all_tasks_or_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    # Fall back to 10 taskmanagers when the config does not say otherwise.
    taskmanager_config = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_config.get("instances", 10)

    # (sub-component, expected healthy count); supervisor and jobmanager are
    # each expected exactly once.
    component_expectations = (
        ("supervisor", 1),
        ("jobmanager", 1),
        ("taskmanager", expected_taskmanagers),
    )
    # Gather every healthy count (and the CR name) before running any check.
    healthy_counts = [
        healthy_flink_containers_cnt(pods, component)
        for component, _ in component_expectations
    ]
    cr_name = get_cr_name(pods)

    results = [
        check_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy,
            sub_component=component,
        )
        for (component, expected), healthy in zip(
            component_expectations, healthy_counts
        )
    ]
    results.append(
        check_under_registered_taskmanagers(
            instance_config=instance_config,
            expected_count=expected_taskmanagers,
            cr_name=cr_name,
        )
    )

    # Each result is indexed as (failed, short output, long description).
    output = ", ".join(result[1] for result in results)
    description = "\n########\n".join(result[2] for result in results)

    if not any(result[0] for result in results):
        log.info(output)
        status = pysensu_yelp.Status.OK
    else:
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
def test_send_replication_event_users_monitoring_tools_send_event_properly(
    instance_config,
):
    """Check the keyword arguments ``send_replication_event`` hands to ``send_event``.

    With a monitoring config containing only an unrelated key, the overrides
    must include that key plus runbook, tip, ``alert_after``, ``check_every``
    and the supplied description; ``dry_run`` must be passed through.
    """
    status = "999999"
    output = "YOU DID IT"
    description = "SOME CONTEXT"
    instance_config.get_monitoring.return_value = {"fake_key": "fake_value"}
    check_name = "check_paasta_services_replication.%s" % instance_config.job_id

    patch_send_event = mock.patch(
        "paasta_tools.monitoring_tools.send_event", autospec=True
    )
    patch_log = mock.patch("paasta_tools.monitoring_tools._log", autospec=True)
    patch_runbook = mock.patch(
        "paasta_tools.monitoring_tools.get_runbook",
        autospec=True,
        return_value="y/runbook",
    )

    with patch_send_event as send_event_patch, patch_log, patch_runbook:
        monitoring_tools.send_replication_event(
            instance_config=instance_config,
            status=status,
            output=output,
            description=description,
            dry_run=True,
        )

        expected_overrides = {
            "fake_key": "fake_value",
            "runbook": mock.ANY,
            "tip": mock.ANY,
            "alert_after": "2m",
            "check_every": "1m",
            "description": description,
        }
        expected_kwargs = dict(
            service=instance_config.service,
            check_name=check_name,
            overrides=expected_overrides,
            status=status,
            output=output,
            soa_dir=instance_config.soa_dir,
            cluster=instance_config.cluster,
            dry_run=True,
        )
        send_event_patch.assert_called_once_with(**expected_kwargs)
def send_event_if_not_enough_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    num_reported: Optional[int],
    strerror: Optional[str],
) -> None:
    """Send a replication event based on the taskmanager count reported by the dashboard.

    When ``strerror`` is ``None`` the reported count is compared against
    ``expected_count`` using the instance's critical replication percentage.
    Any non-``None`` ``strerror`` means the dashboard could not be queried and
    always results in a CRITICAL event.

    :param instance_config: config of the Flink instance being checked
    :param expected_count: number of taskmanagers that should be reported
    :param num_reported: number reported by the dashboard; it is formatted
        with ``%d`` whenever ``strerror`` is ``None``, so it must be an int then
    :param strerror: error text from querying the dashboard, or ``None``
    """
    under_replicated = False
    # Decide "dashboard unavailable" once. The original chose the message with
    # `is None` but the status with truthiness, so an empty error string
    # produced the "not available" message with an OK status.
    dashboard_unavailable = strerror is not None

    if dashboard_unavailable:
        output = ("Dashboard of service %s is not available!\n" + "(%s)") % (
            instance_config.job_id,
            strerror,
        )
    else:
        crit_threshold = instance_config.get_replication_crit_percentage()
        output = (
            "Service %s has %d out of %d expected instances of %s reported by dashboard!\n"
            + "(threshold: %d%%)"
        ) % (
            instance_config.job_id,
            num_reported,
            expected_count,
            "taskmanager",
            crit_threshold,
        )
        under_replicated, _ = is_under_replicated(
            num_reported, expected_count, crit_threshold
        )

    if under_replicated or dashboard_unavailable:
        # Append the explanation and a ready-to-run status command.
        output += _event_explanation()
        output += (
            " paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            "service": instance_config.service,
            "instance": instance_config.instance,
            "cluster": instance_config.cluster,
        }
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK

    send_replication_event(
        instance_config=instance_config, status=status, output=output
    )
def test_send_replication_event_users_monitoring_tools_send_event_respects_alert_after(
    instance_config,
):
    """Check that a configured ``alert_after`` is forwarded to ``send_event``.

    The instance's monitoring config sets ``alert_after`` to ``'666m'``; the
    overrides passed to ``monitoring_tools.send_event`` must carry that value
    together with the patched runbook and the ``check_every`` value.
    """
    fake_status = '999999'
    fake_output = 'YOU DID IT'
    instance_config.get_monitoring.return_value = {'alert_after': '666m'}
    expected_check_name = (
        'check_paasta_services_replication.%s' % instance_config.job_id
    )
    with mock.patch(
        "paasta_tools.monitoring_tools.send_event", autospec=True,
    ) as send_event_patch, mock.patch(
        "paasta_tools.monitoring_tools._log", autospec=True,
    ), mock.patch(
        'paasta_tools.monitoring_tools.get_runbook',
        autospec=True,
        return_value='y/runbook',
    ):
        monitoring_tools.send_replication_event(
            instance_config=instance_config,
            status=fake_status,
            output=fake_output,
        )
        # Previously a bare comparison whose result was discarded; it has to
        # be an assert to actually verify anything.
        assert send_event_patch.call_count == 1
        send_event_patch.assert_called_once_with(
            service=instance_config.service,
            check_name=expected_check_name,
            overrides={
                'runbook': 'y/runbook',
                'alert_after': '666m',
                'check_every': '1m',
            },
            status=fake_status,
            output=fake_output,
            soa_dir=instance_config.soa_dir,
            cluster=instance_config.cluster,
        )