Example #1
    def test_should_generate_pending_count_with_alerts_graph(self):
        name = "service-1"
        cloudwatch_data_source = "prod"
        cluster_name = "cluster-1"
        grid_pos = GridPos(1, 2, 3, 4)
        notifications = ["foo", "bar"]

        panel = generate_pending_count_graph(
            name=name,
            cloudwatch_data_source=cloudwatch_data_source,
            cluster_name=cluster_name,
            grid_pos=grid_pos,
            notifications=notifications,
        )

        panel.alert.should.be.a(Alert)
        panel.alert.gracePeriod.should.eql("15m")
        panel.alert.alertConditions.should.have.length_of(1)
        panel.alert.alertConditions[0].should.eql(
            AlertCondition(
                Target(refId="A"),
                timeRange=TimeRange("5m", "now"),
                evaluator=GreaterThan(0),
                reducerType=RTYPE_MAX,
                operator=OP_AND,
            )
        )
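
The generator under test is not shown in this collection. Below is a minimal reconstruction implied by the assertions, assuming the constants (CONTAINER_INSIGHTS_NAMESPACE, ALERT_REF_ID, TRANSPARENT, EDITABLE) and conventions of the sibling generators in the later examples; only the alert parameters are pinned down by the test, and every other name is an assumption.

def generate_pending_count_graph(
    name: str,
    cloudwatch_data_source: str,
    cluster_name: str,
    grid_pos: GridPos,
    notifications: List[str],
) -> Graph:
    targets = [
        CloudwatchMetricsTarget(
            alias="Containers",  # assumed alias
            namespace=CONTAINER_INSIGHTS_NAMESPACE,  # assumed constant
            statistics=["Maximum"],
            metricName="PendingTaskCount",  # assumed Container Insights metric
            dimensions={"ServiceName": name, "ClusterName": cluster_name},
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="{} pending containers".format(name),  # assumed wording
            message="{} has pending containers".format(name),  # assumed wording
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="15m",  # asserted by the test
            notifications=notifications,
        )

    return Graph(
        title="Pending Tasks",  # assumed title
        dataSource=cloudwatch_data_source,
        targets=targets,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        gridPos=grid_pos,
    ).auto_ref_ids()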
Example #2
    def test_should_generate_mem_utilization_percentage_with_alerts_graph(self):
        name = "service-1"
        cloudwatch_data_source = "prod"
        cluster_name = "cluster-1"
        grid_pos = GridPos(1, 2, 3, 4)
        notifications = ["foo", "bar", "baz"]

        expected_alert_condition = AlertCondition(
            Target(refId="A"),
            timeRange=TimeRange("15m", "now"),
            evaluator=GreaterThan(85),
            reducerType=RTYPE_MAX,
            operator=OP_AND,
        )

        panel = generate_mem_utilization_percentage_graph(
            name=name,
            cloudwatch_data_source=cloudwatch_data_source,
            cluster_name=cluster_name,
            grid_pos=grid_pos,
            notifications=notifications,
        )

        panel.alert.should.be.a(Alert)
        panel.alert.alertConditions.should.have.length_of(1)
        panel.alert.alertConditions[0].should.eql(expected_alert_condition)
        panel.alert.notifications.should.eql(notifications)
Example #3
    def test_should_generate_res_count_graph_with_alert(self):
        name = "service-1"
        cloudwatch_data_source = "prod"
        loadbalancer = "loadbalancer-1"
        target_group = "target-group-1"
        grid_pos = GridPos(1, 2, 3, 4)
        notifications = ["foo", "bar", "baz"]

        panel = generate_res_count_graph(
            name=name,
            cloudwatch_data_source=cloudwatch_data_source,
            grid_pos=grid_pos,
            loadbalancer=loadbalancer,
            target_group=target_group,
            notifications=notifications,
        )
        panel.alert.should.be.a(Alert)
        panel.alert.message.should.eql("{} has 5XX errors".format(name))
        panel.alert.alertConditions.should.have.length_of(1)
        panel.alert.alertConditions.should.eql(
            [
                AlertCondition(
                    Target(refId="A"),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                ),
            ]
        )
Example #4
def number_of_active_processes_graph(grr_component):
    return Graph(
        title="Number of Active Processes",
        targets=[
            Target(
                expr='sum(up{{job="grr_{}"}})'.format(grr_component),
                legendFormat="Active Processes",
            ),
        ],
        alert=Alert(
            name="Number of Active Processes alert",
            message="The number of active {} processes is below {}".format(
                grr_component.capitalize(),
                config.ACTIVE_PROCESSES_ALERTING_CONDITION),
            alertConditions=[
                AlertCondition(
                    Target(
                        expr='sum(up{{job="grr_{}"}})'.format(grr_component),
                        legendFormat="Active Processes",
                    ),
                    timeRange=TimeRange("10s", "now"),
                    evaluator=LowerThan(
                        config.ACTIVE_PROCESSES_ALERTING_CONDITION),
                    operator=OP_AND,
                    reducerType=RTYPE_SUM,
                )
            ],
        ))
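
For context, a sketch of how such a panel might be mounted on a grafanalib dashboard. The Row/Dashboard wiring and the "worker" component name are illustrative, not taken from the GRR source.

from grafanalib.core import Dashboard, Row

dashboard = Dashboard(
    title="GRR worker health",  # hypothetical dashboard title
    rows=[
        Row(panels=[number_of_active_processes_graph("worker")]),
    ],
).auto_panel_ids()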
Example #5
def generate_elasticsearch_status_red_alert_graph(
        name: str, client_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """
    Generate Elasticsearch graph
    """

    y_axes = YAxes(
        YAxis(format=SHORT_FORMAT),
        YAxis(format=SHORT_FORMAT),
    )

    targets = [
        CloudwatchMetricsTarget(
            alias="Red status",
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName="ClusterStatus.red",
        ),
    ]

    alert = None

    if notifications:
        alert = Alert(
            name="Elasticsearch is in status red",
            message="Elasticsearch is in status red",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    return Graph(
        title="Status RED alerts",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
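
A usage sketch for this generator: build a dashboard around the graph and serialize it to the JSON that Grafana's API accepts, using grafanalib's DashboardEncoder. The domain name, client id, data source, and notification uid below are made up.

import json

from grafanalib.core import Dashboard, Row
from grafanalib._gen import DashboardEncoder

graph = generate_elasticsearch_status_red_alert_graph(
    name="search-cluster",         # hypothetical domain name
    client_id="123456789012",      # hypothetical AWS account id
    cloudwatch_data_source="prod",
    notifications=["ops-channel-uid"],
)

dashboard = Dashboard(
    title="Elasticsearch",
    rows=[Row(panels=[graph])],
).auto_panel_ids()

# Grafana's dashboard API accepts this JSON payload.
print(json.dumps(dashboard.to_json_data(), sort_keys=True, indent=2,
                 cls=DashboardEncoder))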
Example #6
def generate_desired_count_graph(
    name: str,
    cluster_name: str,
    max: int,
    cloudwatch_data_source: str,
    notifications: List[str],
    grid_pos: GridPos,
):
    targets = [
        CloudwatchMetricsTarget(
            alias="Containers",
            namespace=CONTAINER_INSIGHTS_NAMESPACE,
            statistics=["Maximum"],
            metricName="DesiredTaskCount",
            dimensions={
                "ServiceName": name,
                "ClusterName": cluster_name
            },
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None
    if notifications and max > 1:
        alert = Alert(
            name="{} Desired count of containers nearing the max".format(name),
            message="{} is having Desired count of containers nearing the max".
            format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0.9 * max),  # 90% of max
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Desired Tasks",
        dataSource=cloudwatch_data_source,
        targets=targets,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        gridPos=grid_pos,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
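
To make the 90% guard concrete, a quick check in the sure test style used in the earlier examples (all values hypothetical). Note the guard above: with max <= 1, no alert is attached even when notifications are given.

# With max=10 the evaluator becomes GreaterThan(9.0), so the alert fires
# only once DesiredTaskCount reaches the maximum of 10.
panel = generate_desired_count_graph(
    name="service-1",
    cluster_name="cluster-1",
    max=10,
    cloudwatch_data_source="prod",
    notifications=["foo"],
    grid_pos=GridPos(1, 2, 3, 4),
)
panel.alert.alertConditions[0].evaluator.should.eql(GreaterThan(9.0))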
Example #7
def generate_rds_transaction_id_graph(name: str, cloudwatch_data_source: str,
                                      notifications: List[str]):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)

    targets = [
        CloudwatchMetricsTarget(
            alias="Transaction ids used",
            metricName="MaximumUsedTransactionIDs",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None

    if notifications:
        alert = Alert(
            name="{} transaction ids used Errors".format(name),
            message="{} is having transaction ids used errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(1000000000),  # one billion used transaction IDs; PostgreSQL wraps around near 2^31 (~2.1 billion)
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="Transaction ids used",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        alert=alert,
    ).auto_ref_ids()
Example #8
def create_lambda_sqs_dlq_graph(name: str, cloudwatch_data_source: str,
                                fifo: bool, notifications: List[str]):
    """Create SQS Deadletter graph"""

    if fifo:
        name += ".fifo"

    targets = [
        CloudwatchMetricsTarget(
            alias="Approximate number of messages available",
            namespace="AWS/SQS",
            statistics=["Maximum"],
            metricName="ApproximateNumberOfMessagesVisible",
            dimensions={"QueueName": name},
            refId=ALERT_REF_ID if notifications else None,
        )
    ]

    yAxes = single_y_axis(format=SHORT_FORMAT)
    alert = None

    # https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-monitoring-using-cloudwatch.html
    # https://aws.amazon.com/about-aws/whats-new/2019/12/amazon-sqs-now-supports-1-minute-cloudwatch-metrics/
    if notifications:
        alert = Alert(
            name="{} messages".format(name),
            message="{} is having messages".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                ),
            ],
            gracePeriod="5m",
            notifications=notifications,
        )

    return Graph(
        title="SQS Dead Letter Queue: {}".format(name),
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
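
A small check of the FIFO handling, again in the sure test style (the queue name is hypothetical): the ".fifo" suffix appended at the top of the function propagates to the queue dimension, the alert name, and the panel title.

panel = create_lambda_sqs_dlq_graph(
    name="orders-dlq",
    cloudwatch_data_source="prod",
    fifo=True,
    notifications=["foo"],
)
panel.title.should.eql("SQS Dead Letter Queue: orders-dlq.fifo")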
Example #9
def generate_elasticsearch_storage_graph(name: str, client_id: str,
                                         cloudwatch_data_source: str,
                                         notifications: List[str]) -> Graph:
    """
    Generate Elasticsearch graph
    """

    y_axes = YAxes(
        YAxis(format=MEGA_BYTES),
        YAxis(format=MEGA_BYTES),
    )
    free_storage_alias = "Free storage"
    cluster_used_space_alias = "Used space"

    targets = [
        CloudwatchMetricsTarget(
            alias=free_storage_alias,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Minimum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName="FreeStorageSpace",
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=cluster_used_space_alias,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName="ClusterUsedSpace",
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="Elasticsearch storage alert",
            message="Elasticsearch might be low on storage",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=LowerThan(10240),  # below 10240 MB (~10 GiB) of free storage; CloudWatch reports this metric in megabytes
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    series_overrides = [
        {
            "alias": free_storage_alias,
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
        {
            "alias": cluster_used_space_alias,
            "color": colors.ORANGE,
            "lines": True,
            "bars": False,
            "yaxis": 2,
        },
    ]

    return Graph(
        title="Storage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
Example #10
def generate_res_count_graph(
    name: str,
    cloudwatch_data_source: str,
    loadbalancer: str,
    target_group: str,
    grid_pos: GridPos,
    notifications: List[str],
) -> Graph:
    """
    Generate res graph
    """

    xx2_alias = "2xx"
    xx3_alias = "3xx"
    xx4_alias = "4xx"
    xx5_alias = "5xx"

    targets = [
        CloudwatchMetricsTarget(
            alias=xx2_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_2XX_Count",
            dimensions={
                "LoadBalancer": loadbalancer,
                "TargetGroup": target_group
            },
        ),
        CloudwatchMetricsTarget(
            alias=xx3_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_3XX_Count",
            dimensions={
                "LoadBalancer": loadbalancer,
                "TargetGroup": target_group
            },
        ),
        CloudwatchMetricsTarget(
            alias=xx4_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_4XX_Count",
            dimensions={
                "LoadBalancer": loadbalancer,
                "TargetGroup": target_group
            },
        ),
        CloudwatchMetricsTarget(
            alias=xx5_alias,
            namespace="AWS/ApplicationELB",
            statistics=["Sum"],
            metricName="HTTPCode_Target_5XX_Count",
            dimensions={
                "LoadBalancer": loadbalancer,
                "TargetGroup": target_group
            },
            refId=ALERT_REF_ID,
        ),
    ]

    seriesOverrides = [
        {
            "alias": xx2_alias,
            "color": colors.GREEN,
            "fill": 0
        },
        {
            "alias": xx3_alias,
            "color": colors.YELLOW,
            "fill": 0
        },
        {
            "alias": xx4_alias,
            "color": colors.ORANGE,
            "fill": 0
        },
        {
            "alias": xx5_alias,
            "color": colors.RED,
            "fill": 0
        },
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="{} has 5XX errors".format(name),
            message="{} has 5XX errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Responses",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        gridPos=grid_pos,
        alert=alert,
    ).auto_ref_ids()
Example #11
def generate_mem_utilization_percentage_graph(
    name: str,
    cloudwatch_data_source: str,
    cluster_name: str,
    notifications: List[str],
    grid_pos: GridPos,
) -> Graph:
    """
    Generate Mem Percentage graph
    """

    y_axes = single_y_axis(format=PERCENT_FORMAT)

    targets = [
        CloudwatchMetricsTarget(
            alias=MINIMUM_ALIAS,
            namespace=ECS_NAMESPACE,
            statistics=["Minimum"],
            metricName="MemoryUtilization",
            dimensions={
                "ServiceName": name,
                "ClusterName": cluster_name
            },
        ),
        CloudwatchMetricsTarget(
            alias=AVERAGE_ALIAS,
            namespace=ECS_NAMESPACE,
            statistics=["Average"],
            metricName="MemoryUtilization",
            dimensions={
                "ServiceName": name,
                "ClusterName": cluster_name
            },
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=MAXIMUM_ALIAS,
            namespace=ECS_NAMESPACE,
            statistics=["Maximum"],
            metricName="MemoryUtilization",
            dimensions={
                "ServiceName": name,
                "ClusterName": cluster_name
            },
        ),
    ]

    seriesOverrides = [
        {
            "alias": MINIMUM_ALIAS,
            "color": colors.GREEN,
            "lines": False
        },
        {
            "alias": AVERAGE_ALIAS,
            "color": colors.YELLOW,
            "fill": 0
        },
        {
            "alias": MAXIMUM_ALIAS,
            "color": colors.GREEN,
            "fillBelowTo": MINIMUM_ALIAS,
            "lines": False,
        },
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="{} Memory utilization Errors".format(name),
            message="{} is having Memory utilization errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(85),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Memory Utilization Percentage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=seriesOverrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        gridPos=grid_pos,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
Example #12
def generate_sfn_execution_metrics_graph(name: str,
                                         cloudwatch_data_source: str,
                                         notifications: List[str], *args,
                                         **kwargs):
    """
    Generate step function graph
    """

    targets = [
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_STARTED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsStarted",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_SUCCEEDED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsSucceeded",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_ABORTED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsAborted",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_ABORTED_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_FAILED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsFailed",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_FAILED_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_THROTTLED_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsThrottled",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_THROTTLED_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=SFN_EXECUTIONS_TIMEDOUT_ALIAS,
            namespace=NAMESPACE,
            metricName="ExecutionsTimedOut",
            statistics=["Sum"],
            dimensions={"StateMachineArn": name},
            refId=SFN_EXECUTIONS_TIMEDOUT_REF_ID,
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT, decimals=2),
        YAxis(format=SHORT_FORMAT, decimals=2),
    )

    seriesOverrides = [
        {
            "alias": SFN_EXECUTIONS_STARTED_ALIAS,
            "points": False,
            "color": colors.BLUE,
        },
        {
            "alias": SFN_EXECUTIONS_SUCCEEDED_ALIAS,
            "points": False,
            "color": colors.GREEN,
        },
        {
            "alias": SFN_EXECUTIONS_ABORTED_ALIAS,
            "points": False,
            "color": colors.RED,
        },
        {
            "alias": SFN_EXECUTIONS_FAILED_ALIAS,
            "points": False,
            "color": colors.RED,
        },
        {
            "alias": SFN_EXECUTIONS_THROTTLED_ALIAS,
            "points": False,
            "color": colors.ORANGE,
        },
        {
            "alias": SFN_EXECUTIONS_TIMEDOUT_ALIAS,
            "points": False,
            "color": colors.RED,
        },
    ]

    alert = None

    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} execution issues".format(name),
            message="{} might have failed, aborted, throttled or timedout".
            format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_ABORTED_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_FAILED_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_THROTTLED_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
                AlertCondition(
                    Target(refId=SFN_EXECUTIONS_TIMEDOUT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="Step function execution metrics",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
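
Because the four conditions are combined with OP_OR, any single failing metric (aborted, failed, throttled, or timed out) trips the alert. A quick shape check in the sure test style (the state machine ARN is made up):

panel = generate_sfn_execution_metrics_graph(
    name="arn:aws:states:us-east-1:123456789012:stateMachine:demo",
    cloudwatch_data_source="prod",
    notifications=["foo"],
)
panel.alert.alertConditions.should.have.length_of(4)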
Example #13
def generate_elasticsearch_jvm_memory_pressure_graph(
        name: str, client_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """
    Generate Elasticsearch graph
    """

    y_axes = single_y_axis(format=PERCENT_FORMAT)
    alias = "JVM memory pressure"

    targets = [
        CloudwatchMetricsTarget(
            alias=alias,
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName="JVMMemoryPressure",
            refId=ALERT_REF_ID,
        )
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="Elasticsearch JVM memory pressure alert",
            message="Elasticsearch JVM memory pressure alert",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(80),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    series_overrides = [{
        "alias": alias,
        "color": colors.GREEN,
        "lines": True,
        "bars": False,
    }]

    return Graph(
        title=alias,
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
Example #14
Graph(  # enclosing Graph( reconstructed from context; the title is inferred from the alert below (cf. Example #4)
    title="Number of Active Processes",
    targets=[
        Target(
            expr='sum(up{job="fleetspeak"})',
            legendFormat="Active Processes",
        ),
    ],
    alert=Alert(
        name="Number of Active Processes alert",
        message="The number of active Fleetspeak Server processes is below {}".format(
            ACTIVE_PROCESSES_ALERTING_CONDITION),
        alertConditions=[
            AlertCondition(
                Target(
                    expr='sum(up{job="fleetspeak"})',
                    legendFormat="Active Processes",
                ),
                timeRange=TimeRange("10s", "now"),
                evaluator=LowerThan(ACTIVE_PROCESSES_ALERTING_CONDITION),
                operator=OP_AND,
                reducerType=RTYPE_SUM,
            )
        ],
    ),
),
Graph(
    title="Sum of Process Memory Bytes (across all instances)",
    targets=[
        Target(
            expr='sum(process_resident_memory_bytes{job="fleetspeak"})',
            legendFormat="Resident Memory",
        ),
    ]),
Example #15
def generate_elasticache_redis_cpu_credit_usage_graph(
        cache_cluster_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """
    Generate ElastiCache Redis graph
    """

    y_axes = single_y_axis(format=SHORT_FORMAT)
    aliases = {
        "credit balance": "CPU credit balance",
        "credit usage": "CPU credit usage",
    }

    targets = [
        CloudwatchMetricsTarget(
            alias=aliases["credit balance"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Minimum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="CPUCreditBalance",
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=aliases["credit usage"],
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={"CacheClusterId": cache_cluster_id},
            metricName="CPUCreditUsage",
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="ElastiCache Redis CPU credit balance alert",
            message="ElastiCache Redis CPU credit balance alert",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=LowerThan(250),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    series_overrides = [
        {
            "alias": aliases["credit balance"],
            "color": colors.GREEN,
            "lines": True,
            "bars": False,
        },
        {
            "alias": aliases["credit usage"],
            "color": colors.YELLOW,
            "lines": True,
            "bars": False,
        },
    ]

    return Graph(
        title="CPU credit utilization",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
Example #16
def generate_rds_cpu_graph(name: str, cloudwatch_data_source: str,
                           notifications: List[str]):
    """
    Generate rds graph
    """

    y_axes = single_y_axis(format=PERCENT_FORMAT)
    min_alias = "min"
    max_alias = "max"
    mean_alias = "mean"

    targets = [
        CloudwatchMetricsTarget(
            alias=max_alias,
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
            statistics=["Maximum"],
            metricName="CPUUtilization",
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=mean_alias,
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            statistics=["Average"],
            metricName="CPUUtilization",
            period="1m",
        ),
        CloudwatchMetricsTarget(
            alias=min_alias,
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            statistics=["Minimum"],
            metricName="CPUUtilization",
            period="1m",
        ),
    ]

    series_overrides = get_series_overrides(min_alias, mean_alias, max_alias)

    alert = None

    if notifications:
        alert = Alert(
            name="{} CPU utilization Errors".format(name),
            message="{} is having CPU utilization errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(80),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="CPU utilization",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        seriesOverrides=series_overrides,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        alert=alert,
    ).auto_ref_ids()
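
get_series_overrides is not shown in this collection; a plausible sketch, inferred from the inline min/average/max overrides used in Examples #11 and #17 (the exact colors are assumptions):

def get_series_overrides(min_alias: str, mean_alias: str,
                         max_alias: str) -> List[dict]:
    # Draw the mean as a line and shade the band between min and max.
    return [
        {"alias": min_alias, "color": colors.GREEN, "lines": False},
        {"alias": mean_alias, "color": colors.YELLOW, "fill": 0},
        {
            "alias": max_alias,
            "color": colors.GREEN,
            "fillBelowTo": min_alias,
            "lines": False,
        },
    ]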
Example #17
def lambda_generate_memory_utilization_percentage_graph(
    name: str,
    cloudwatch_data_source: str,
    lambda_insights_namespace: str,
    notifications: List[str],
    *args,
    **kwargs,
) -> Graph:
    """
    Generate lambda graph
    """

    targets = [
        CloudwatchMetricsTarget(
            alias=MINIMUM_ALIAS,
            namespace=lambda_insights_namespace,
            statistics=["Minimum"],
            metricName="memory_utilization",
            dimensions={"function_name": name},
        ),
        CloudwatchMetricsTarget(
            alias=AVERAGE_ALIAS,
            namespace=lambda_insights_namespace,
            statistics=["Average"],
            metricName="memory_utilization",
            dimensions={"function_name": name},
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=MAXIMUM_ALIAS,
            namespace=lambda_insights_namespace,
            statistics=["Maximum"],
            metricName="memory_utilization",
            dimensions={"function_name": name},
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT, decimals=2),
        YAxis(format=SHORT_FORMAT, decimals=2),
    )

    seriesOverrides = [
        {
            "alias": MINIMUM_ALIAS,
            "color": "#C8F2C2",
            "lines": False
        },
        {
            "alias": AVERAGE_ALIAS,
            "color": "#FADE2A",
            "fill": 0
        },
        {
            "alias": MAXIMUM_ALIAS,
            "color": "rgb(77, 159, 179)",
            "fillBelowTo": MINIMUM_ALIAS,
            "lines": False,
        },
    ]

    alert = None

    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} Memory utilization Errors".format(name),
            message="{} is having Memory utilization errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(90),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Lambda Memory Utilization Percentage",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
        # gridPos=GridPos(8,12,0,0)
    ).auto_ref_ids()
Example #18
def lambda_generate_invocations_graph(name: str, cloudwatch_data_source: str,
                                      notifications: List[str], *args,
                                      **kwargs) -> Graph:
    """
    Generate lambda graph
    """

    targets = [
        CloudwatchMetricsTarget(
            alias=LAMBDA_INVOCATIONS_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="Invocations",
            dimensions={"FunctionName": name},
        ),
        CloudwatchMetricsTarget(
            alias=LAMBDA_ERRORS_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="Errors",
            dimensions={"FunctionName": name},
            refId=ALERT_REF_ID,
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT, decimals=2),
        YAxis(format=SHORT_FORMAT, decimals=2),
    )

    seriesOverrides = [
        {
            "alias": LAMBDA_INVOCATIONS_ALIAS,
            "points": False,
            "color": colors.GREEN,
        },
        {
            "alias": LAMBDA_ERRORS_ALIAS,
            "points": False,
            "color": colors.RED,
        },
    ]

    alert = None

    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} Invocation Errors".format(name),
            message="{} is having invocation errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Lambda Invocations and Errors",
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
        # gridPos=GridPos(8,12,0,0)
    ).auto_ref_ids()
Example #19
def generate_api_gateway_requests_graph(name: str, cloudwatch_data_source: str,
                                        notifications: List[str], *args,
                                        **kwargs):
    targets = [
        CloudwatchMetricsTarget(
            alias=API_GATEWAY_5XX_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="5XXError",
            dimensions={"ApiName": name},
            refId=ALERT_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=API_GATEWAY_REQUESTS_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="Count",
            dimensions={"ApiName": name},
            refId=API_GATEWAY_REQUESTS_REF_ID,
        ),
        CloudwatchMetricsTarget(
            alias=API_GATEWAY_4XX_ALIAS,
            namespace=NAMESPACE,
            statistics=["Sum"],
            metricName="4XXError",
            dimensions={"ApiName": name},
            refId=API_GATEWAY_4XX_REF_ID,
        ),
    ]

    yAxes = YAxes(
        YAxis(format=SHORT_FORMAT),
        YAxis(format=SHORT_FORMAT),
    )

    seriesOverrides = [
        {
            "alias": API_GATEWAY_REQUESTS_ALIAS,
            "points": False,
            "color": colors.GREEN,
        },
        {
            "alias": API_GATEWAY_4XX_ALIAS,
            "color": colors.YELLOW,
        },
        {
            "alias": API_GATEWAY_5XX_ALIAS,
            "color": colors.RED,
        },
    ]

    alert = None

    # https://docs.aws.amazon.com/lambda/latest/dg/monitoring-metrics.html
    if notifications:
        alert = Alert(
            name="{} API Gateway 5XX Errors".format(name),
            message="{} is having 5XX errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    return Graph(
        title="API Gateway Requests: {}".format(name),
        dataSource=cloudwatch_data_source,
        targets=targets,
        seriesOverrides=seriesOverrides,
        yAxes=yAxes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
Example #20
        ),
    ],
    yAxes=YAxes(
        YAxis(format=OPS_FORMAT),
        YAxis(format=SHORT_FORMAT),
    ),
    alert=Alert(
        name="Too many 500s on Nginx",
        message="More than 5 QPS of 500s on Nginx for 5 minutes",
        alertConditions=[
            AlertCondition(
                Target(
                    expr='sum(irate(nginx_http_requests_total{job="default/frontend",status=~"5.."}[1m]))',
                    legendFormat="5xx",
                    refId='A',
                ),
                timeRange=TimeRange("5m", "now"),
                evaluator=GreaterThan(5),
                operator=OP_AND,
                reducerType=RTYPE_SUM,
            ),
        ],
        notifications=[
            Notification("notification_channel_uid"),
        ],
    )),
Graph(
    title="Frontend latency",
    dataSource='My Prometheus',
    targets=[
        Target(