Exemplo n.º 1
0
def test_send_replication_event_users_monitoring_tools_send_event_respects_alert_after(
    instance_config
):
    """Check that a configured ``alert_after`` reaches ``send_event`` unchanged.

    ``instance_config.get_monitoring`` is set up to return
    ``{"alert_after": "666m"}``; the overrides passed to the patched
    ``monitoring_tools.send_event`` are expected to carry that same value.
    ``_log`` and ``get_runbook`` are patched so only the ``send_event`` call
    is observed.
    """
    fake_status = "999999"
    fake_output = "YOU DID IT"
    instance_config.get_monitoring.return_value = {"alert_after": "666m"}
    expected_check_name = (
        "check_paasta_services_replication.%s" % instance_config.job_id
    )
    with mock.patch(
        "paasta_tools.monitoring_tools.send_event", autospec=True
    ) as send_event_patch, mock.patch(
        "paasta_tools.monitoring_tools._log", autospec=True
    ), mock.patch(
        "paasta_tools.monitoring_tools.get_runbook",
        autospec=True,
        return_value="y/runbook",
    ):
        monitoring_tools.send_replication_event(
            instance_config=instance_config, status=fake_status, output=fake_output
        )
        # This used to be a bare comparison whose result was thrown away, so
        # it verified nothing; it has to be an actual assertion.
        assert send_event_patch.call_count == 1
        send_event_patch.assert_called_once_with(
            service=instance_config.service,
            check_name=expected_check_name,
            overrides={
                "runbook": "y/runbook",
                "alert_after": "666m",
                "check_every": "1m",
            },
            status=fake_status,
            output=fake_output,
            soa_dir=instance_config.soa_dir,
            cluster=instance_config.cluster,
        )
Exemplo n.º 2
0
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_tasks_or_pods: Sequence[V1Pod],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> None:
    """Run the Flink health checks for one service instance and report them.

    The pods belonging to ``instance_config``'s service/instance are selected
    from ``all_tasks_or_pods``; healthy supervisor, jobmanager and taskmanager
    containers are counted and compared against the expected numbers (one
    supervisor, one jobmanager, and the configured ``taskmanager.instances``,
    falling back to 10). A fourth check looks at registered taskmanagers for
    the pods' CR name. All results are merged into a single replication event.

    :param instance_config: deployment config of the Flink instance to check
    :param all_tasks_or_pods: pods to filter down to this service instance
    :param replication_checker: accepted but not used by this function's body
    :param dry_run: forwarded to ``send_replication_event``
    """
    pods = filter_pods_by_service_instance(
        pod_list=all_tasks_or_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    taskmanager_config = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_config.get("instances", 10)

    healthy_supervisors = healthy_flink_containers_cnt(pods, "supervisor")
    healthy_jobmanagers = healthy_flink_containers_cnt(pods, "jobmanager")
    healthy_taskmanagers = healthy_flink_containers_cnt(pods, "taskmanager")

    cr_name = get_cr_name(pods)

    # One row per replication check: (sub_component, expected, healthy).
    replication_checks = (
        ("supervisor", 1, healthy_supervisors),
        ("jobmanager", 1, healthy_jobmanagers),
        ("taskmanager", expected_taskmanagers, healthy_taskmanagers),
    )
    results = [
        check_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy,
            sub_component=component,
        )
        for component, expected, healthy in replication_checks
    ]
    results.append(
        check_under_registered_taskmanagers(
            instance_config=instance_config,
            expected_count=expected_taskmanagers,
            cr_name=cr_name,
        )
    )

    # Each result is indexed as: [0] failure flag, [1] output, [2] description.
    output = ", ".join(result[1] for result in results)
    description = "\n########\n".join(result[2] for result in results)

    any_check_failed = any(result[0] for result in results)
    if not any_check_failed:
        log.info(output)
        status = pysensu_yelp.Status.OK
    else:
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
Exemplo n.º 3
0
def test_send_replication_event_users_monitoring_tools_send_event_properly(
    instance_config,
):
    """Check the exact arguments ``send_replication_event`` hands to ``send_event``.

    With a monitoring config of ``{"fake_key": "fake_value"}``, the overrides
    are expected to contain that key plus runbook, tip, alert_after,
    check_every and the supplied description, and ``dry_run=True`` must be
    forwarded.
    """
    status = "999999"
    output = "YOU DID IT"
    description = "SOME CONTEXT"
    instance_config.get_monitoring.return_value = {"fake_key": "fake_value"}

    check_name = "check_paasta_services_replication.%s" % instance_config.job_id
    expected_overrides = {
        "fake_key": "fake_value",
        "runbook": mock.ANY,
        "tip": mock.ANY,
        "alert_after": "2m",
        "check_every": "1m",
        "description": description,
    }

    with mock.patch(
        "paasta_tools.monitoring_tools.send_event", autospec=True
    ) as send_event_patch:
        with mock.patch(
            "paasta_tools.monitoring_tools._log", autospec=True
        ), mock.patch(
            "paasta_tools.monitoring_tools.get_runbook",
            autospec=True,
            return_value="y/runbook",
        ):
            monitoring_tools.send_replication_event(
                instance_config=instance_config,
                status=status,
                output=output,
                description=description,
                dry_run=True,
            )
            send_event_patch.assert_called_once_with(
                service=instance_config.service,
                check_name=check_name,
                overrides=expected_overrides,
                status=status,
                output=output,
                soa_dir=instance_config.soa_dir,
                cluster=instance_config.cluster,
                dry_run=True,
            )
def send_event_if_not_enough_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    num_reported: Optional[int],
    strerror: Optional[str],
) -> None:
    """Send a replication event for the taskmanager count reported by the dashboard.

    When ``strerror`` is ``None`` the reported count is compared with
    ``expected_count`` using the instance's replication crit percentage.
    Any other ``strerror`` value (including an empty string) means the
    dashboard could not be queried and is always reported as CRITICAL.

    :param instance_config: deployment config of the Flink instance
    :param expected_count: number of taskmanagers that should be reported
    :param num_reported: number of taskmanagers reported; only formatted and
        compared when ``strerror`` is ``None``
    :param strerror: error text from the dashboard lookup, or ``None`` on success
    """
    # Use one definition of "the dashboard failed" for both the message and
    # the status. The old truthiness test let an empty error string produce
    # the "not available" message together with an OK status.
    dashboard_failed = strerror is not None
    under_replicated = False
    if dashboard_failed:
        output = ("Dashboard of service %s is not available!\n" + "(%s)") % (
            instance_config.job_id,
            strerror,
        )
    else:
        crit_threshold = instance_config.get_replication_crit_percentage()
        output = (
            "Service %s has %d out of %d expected instances of %s reported by dashboard!\n"
            + "(threshold: %d%%)"
        ) % (
            instance_config.job_id,
            num_reported,
            expected_count,
            "taskmanager",
            crit_threshold,
        )
        # Only the first element of the returned pair is needed here.
        under_replicated, _ = is_under_replicated(
            num_reported, expected_count, crit_threshold
        )
    if under_replicated or dashboard_failed:
        # Append the text from _event_explanation() and a ready-to-run
        # status command for this service instance.
        output += _event_explanation()
        output += (
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            "service": instance_config.service,
            "instance": instance_config.instance,
            "cluster": instance_config.cluster,
        }
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK
    send_replication_event(
        instance_config=instance_config, status=status, output=output
    )
Exemplo n.º 5
0
def test_send_replication_event_users_monitoring_tools_send_event_respects_alert_after(
        instance_config):
    """Check that a configured ``alert_after`` reaches ``send_event`` unchanged.

    ``instance_config.get_monitoring`` is set up to return
    ``{'alert_after': '666m'}``; the overrides passed to the patched
    ``monitoring_tools.send_event`` are expected to carry that same value.
    ``_log`` and ``get_runbook`` are patched so only the ``send_event`` call
    is observed.
    """
    fake_status = '999999'
    fake_output = 'YOU DID IT'
    instance_config.get_monitoring.return_value = {'alert_after': '666m'}
    expected_check_name = ('check_paasta_services_replication.%s' %
                           instance_config.job_id)
    with mock.patch(
            "paasta_tools.monitoring_tools.send_event",
            autospec=True,
    ) as send_event_patch, mock.patch(
            "paasta_tools.monitoring_tools._log",
            autospec=True,
    ), mock.patch(
            'paasta_tools.monitoring_tools.get_runbook',
            autospec=True,
            return_value='y/runbook',
    ):
        monitoring_tools.send_replication_event(
            instance_config=instance_config,
            status=fake_status,
            output=fake_output,
        )
        # This used to be a bare comparison whose result was thrown away, so
        # it verified nothing; it has to be an actual assertion.
        assert send_event_patch.call_count == 1
        send_event_patch.assert_called_once_with(
            service=instance_config.service,
            check_name=expected_check_name,
            overrides={
                'runbook': 'y/runbook',
                'alert_after': '666m',
                'check_every': '1m',
            },
            status=fake_status,
            output=fake_output,
            soa_dir=instance_config.soa_dir,
            cluster=instance_config.cluster,
        )