def node_errors() -> Graph:
    return Graph(
        title="node event recording count",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate(flyte:propeller:all:node:perma_system_error_duration_unlabeled_ms_count[5m]))',
                legendFormat="system error",
                refId='A',
            ),
            Target(
                expr=f'sum(rate(flyte:propeller:all:node:perma_user_error_duration_unlabeled_ms[5m]))',
                legendFormat="user error",
                refId='B',
            ),
            Target(
                expr=f'sum(rate(flyte:propeller:all:node:perma_unknown_error_duration_unlabeled_ms[5m]))',
                legendFormat="unknown error",
                refId='C',
            ),
        ],
        yAxes=single_y_axis(format=NO_FORMAT),
    )
def wf_event_recording() -> typing.List[Graph]:
    return [
        Graph(
            title="wf event recording latency success",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=f'sum(flyte:propeller:all:workflow:event_recording:success_duration_ms) by (quantile, wf)',
                    refId='A',
                ),
            ],
            yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
        ),
        Graph(
            title="wf event recording count",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=f'sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count[5m])) by (wf)',
                    legendFormat="success",
                    refId='A',
                ),
                Target(
                    expr=f'sum(rate(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count[5m])) by (wf)',
                    legendFormat="failure",
                    refId='B',
                ),
            ],
            yAxes=single_y_axis(format=NO_FORMAT),
        ),
    ]
def number_of_active_processes_graph(grr_component):
    return Graph(
        title="Number of Active Processes",
        targets=[
            Target(
                expr='sum(up{{job="grr_{}"}})'.format(grr_component),
                legendFormat="Active Processes",
            ),
        ],
        alert=Alert(
            name="Number of Active Processes alert",
            message="The number of active {} processes is below {}".format(
                grr_component.capitalize(),
                config.ACTIVE_PROCESSES_ALERTING_CONDITION),
            alertConditions=[
                AlertCondition(
                    Target(
                        expr='sum(up{{job="grr_{}"}})'.format(grr_component),
                        legendFormat="Active Processes",
                    ),
                    timeRange=TimeRange("10s", "now"),
                    evaluator=LowerThan(
                        config.ACTIVE_PROCESSES_ALERTING_CONDITION),
                    operator=OP_AND,
                    reducerType=RTYPE_SUM)
            ],
        ))
def errors(collapse: bool) -> Row:
    return Row(
        title="Error (System vs user)",
        collapse=collapse,
        panels=[
            Graph(
                title="User errors",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:node:user_error_duration_ms_count{project=~"$project",domain=~"$domain",wf=~"$project:$domain:$workflow"}[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="System errors",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:node:system_error_duration_ms_count{project=~"$project",domain=~"$domain",wf=~"$project:$domain:$workflow"}[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
        ])
def dynamic_wf_build() -> typing.List[Graph]:
    return [
        Graph(
            title="Dynamic workflow build latency",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=f'sum(flyte:propeller:all:node:build_dynamic_workflow_us) by (quantile, wf) / 1000',
                    refId='A',
                ),
            ],
            yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
        ),
        Graph(
            title="Dynamic workflow build count",
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=f'sum(rate(flyte:propeller:all:node:build_dynamic_workflow_us_count[5m])) by (wf)',
                    refId='A',
                ),
            ],
            yAxes=single_y_axis(format=NO_FORMAT),
        ),
    ]
def error_codes(api: str, interval: int = 1) -> Graph:
    return Graph(
        title=f"{api} return codes",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(irate(flyte:admin:{api}:codes:OK[{interval}m]))',
                legendFormat="ok",
                refId='A',
            ),
            Target(
                expr=f'sum(irate(flyte:admin:{api}:codes:InvalidArgument[{interval}m]))',
                legendFormat="invalid-args",
                refId='B',
            ),
            Target(
                expr=f'sum(irate(flyte:admin:{api}:codes:AlreadyExists[{interval}m]))',
                legendFormat="already-exists",
                refId='C',
            ),
            Target(
                expr=f'sum(irate(flyte:admin:{api}:codes:FailedPrecondition[{interval}m]))',
                legendFormat="failed-precondition",
                refId='D',
            ),
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
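# Hypothetical usage sketch: the endpoint name "create_execution" and the
# 5-minute irate window are assumptions for illustration, not values taken
# from flyteadmin.
create_execution_codes_graph = error_codes(api="create_execution", interval=5)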
def wf_store_latency(collapse: bool) -> Row:
    return Row(
        title="etcD write metrics",
        collapse=collapse,
        panels=[
            Graph(
                title="wf update etcD latency",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=f'sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
            Graph(
                title="etcD writes",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=f'sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
            Graph(
                title="etcD write conflicts",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=f'sum(rate(flyte:propeller:all:wf_update_conflict[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
            Graph(
                title="etcD write fail",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr=f'sum(rate(flyte:propeller:all:wf_update_failed[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
        ])
def threadpool_outstanding_tasks_vs_threads_num(grr_component):
    return Graph(
        title="Outstanding Tasks vs. Number of Threads",
        targets=[
            Target(
                expr='sum(threadpool_outstanding_tasks{{job="grr_{}"}})'.format(grr_component),
                legendFormat="Outstanding Tasks",
            ),
            Target(
                expr='sum(threadpool_threads{{job="grr_{}"}})'.format(grr_component),
                legendFormat="Threads",
            ),
        ])
def export(self, output_file=''):
    from baskerville.models.metrics.registry import metrics_registry
    panels = []
    for i, (metric_name, value) in enumerate(metrics_registry.registry.items()):
        # Flush the accumulated panels into a row every four metrics; skip the
        # very first iteration so an empty row is not emitted up front.
        if i > 0 and i % 4 == 0:
            self.rows.append(Row(panels=panels))
            panels = []
        if 'timer' in metric_name:
            # Timers are rendered as a gauge of the average duration.
            g = Gauge()
            g.minValue = 0
            g.maxValue = 100
            g.show = True
            g.thresholdMarkers = True
            panels.append(
                SingleStat(
                    title=metric_name,
                    dataSource=self.ds,
                    gauge=g,
                    targets=[
                        Target(
                            expr=f'({metric_name}_sum / {metric_name}_count)',
                            target=metric_name,
                            refId='A',
                            metric=metric_name,
                            datasource=self.ds,
                        )
                    ]))
        else:
            panels.append(
                Graph(
                    title=metric_name,
                    dataSource=self.ds,
                    targets=[
                        Target(
                            expr=f'{metric_name}_total' if 'total' in metric_name else metric_name,
                            target=metric_name,
                            refId='A',
                            metric=metric_name,
                            datasource=self.ds,
                        )
                    ]))
    # Any panels left over after the loop get a row of their own.
    for panel in panels:
        self.rows.append(Row(panels=[panel]))
    self.dashboard = Dashboard(title=self.dash_title, rows=self.rows).auto_panel_ids()
    with open(output_file, 'w') as f:
        write_dashboard(self.dashboard, f)
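# Hypothetical usage sketch (the exporter class name, data source name and
# output path below are assumptions, not taken from baskerville):
#
#   exporter = DashboardExporter(dash_title='Baskerville metrics', ds='Prometheus')
#   exporter.export(output_file='baskerville_metrics_dashboard.json')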
def quota_stats(collapse: bool) -> Row:
    return Row(
        title="Kubernetes Quota Usage stats",
        collapse=collapse,
        panels=[
            Graph(
                title="CPU Limits vs usage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="hard"}',
                        refId='A',
                        legendFormat="max cpu",
                    ),
                    Target(
                        expr='kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="used"}',
                        refId='B',
                        legendFormat="used cpu",
                    ),
                ],
                yAxes=YAxes(
                    YAxis(format=OPS_FORMAT),
                    YAxis(format=SHORT_FORMAT),
                ),
            ),
            Graph(
                title="Mem Limits vs usage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="hard"}',
                        refId='A',
                        legendFormat="max mem",
                    ),
                    Target(
                        expr='kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="used"}',
                        refId='B',
                        legendFormat="used mem",
                    ),
                ],
                yAxes=YAxes(
                    YAxis(format=OPS_FORMAT),
                    YAxis(format=SHORT_FORMAT),
                ),
            ),
        ])
def test_should_generate_lambda_memory_utilization_percentage_graph_with_alert_notifications(
    self,
):
    lambda_name = "lambda-1"
    cloudwatch_data_source = "cloudwatch"
    lambda_insights_namespace = "insights"
    notifications = ["lorem", "ipsum"]
    expected_alert_query = CloudwatchMetricsTarget(
        alias="Avg",
        namespace=lambda_insights_namespace,
        statistics=["Average"],
        metricName="memory_utilization",
        dimensions={"function_name": lambda_name},
        refId="A",
    )

    generated_lambda_graph = lambda_generate_memory_utilization_percentage_graph(
        name=lambda_name,
        cloudwatch_data_source=cloudwatch_data_source,
        lambda_insights_namespace=lambda_insights_namespace,
        notifications=notifications,
    )
    generated_lambda_graph.should.have.property("alert").be.a(Alert)
    generated_lambda_graph.alert.executionErrorState.should.eql("alerting")
    generated_lambda_graph.alert.noDataState.should.eql("no_data")
    generated_lambda_graph.alert.alertConditions.should.have.length_of(1)
    generated_lambda_graph.alert.alertConditions[0].should.be.a(AlertCondition)
    generated_lambda_graph.alert.alertConditions[0].target.should.eql(
        Target(refId="A"))
    generated_lambda_graph.targets.should.contain(expected_alert_query)
def test_should_generate_mem_utilization_percentage_with_alerts_graph(self):
    name = "service-1"
    cloudwatch_data_source = "prod"
    cluster_name = "cluster-1"
    grid_pos = GridPos(1, 2, 3, 4)
    notifications = ["foo", "bar", "baz"]
    expected_alert_condition = AlertCondition(
        Target(refId="A"),
        timeRange=TimeRange("15m", "now"),
        evaluator=GreaterThan(85),
        reducerType=RTYPE_MAX,
        operator=OP_AND,
    )

    panel = generate_mem_utilization_percentage_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        cluster_name=cluster_name,
        grid_pos=grid_pos,
        notifications=notifications,
    )
    panel.alert.should.be.a(Alert)
    panel.alert.alertConditions.should.have.length_of(1)
    panel.alert.alertConditions[0].should.eql(expected_alert_condition)
    panel.alert.notifications.should.eql(notifications)
def test_should_generate_pending_count_with_alerts_graph(self):
    name = "service-1"
    cloudwatch_data_source = "prod"
    cluster_name = "cluster-1"
    grid_pos = GridPos(1, 2, 3, 4)
    notifications = ["foo", "bar"]

    panel = generate_pending_count_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        cluster_name=cluster_name,
        grid_pos=grid_pos,
        notifications=notifications,
    )
    panel.alert.should.be.a(Alert)
    panel.alert.gracePeriod.should.eql("15m")
    panel.alert.alertConditions.should.have.length_of(1)
    panel.alert.alertConditions[0].should.eql(
        AlertCondition(
            Target(refId="A"),
            timeRange=TimeRange("5m", "now"),
            evaluator=GreaterThan(0),
            reducerType=RTYPE_MAX,
            operator=OP_AND,
        )
    )
def test_should_generate_res_count_graph_with_alert(self):
    name = "service-1"
    cloudwatch_data_source = "prod"
    loadbalancer = "loadbalancer-1"
    target_group = "target-group-1"
    grid_pos = GridPos(1, 2, 3, 4)
    notifications = ["foo", "bar", "baz"]

    panel = generate_res_count_graph(
        name=name,
        cloudwatch_data_source=cloudwatch_data_source,
        grid_pos=grid_pos,
        loadbalancer=loadbalancer,
        target_group=target_group,
        notifications=notifications,
    )
    panel.alert.should.be.a(Alert)
    panel.alert.message.should.eql("{} has 5XX errors".format(name))
    panel.alert.alertConditions.should.have.length_of(1)
    panel.alert.alertConditions.should.eql(
        [
            AlertCondition(
                Target(refId="A"),
                timeRange=TimeRange("15m", "now"),
                evaluator=GreaterThan(0),
                reducerType=RTYPE_MAX,
                operator=OP_AND,
            ),
        ]
    )
def generate_elasticsearch_status_red_alert_graph(
        name: str, client_id: str, cloudwatch_data_source: str,
        notifications: List[str]) -> Graph:
    """
    Generate Elasticsearch status red alert graph
    """
    y_axes = YAxes(
        YAxis(format=SHORT_FORMAT),
        YAxis(format=SHORT_FORMAT),
    )
    targets = [
        CloudwatchMetricsTarget(
            alias="Red status",
            namespace=NAMESPACE,
            period="1m",
            statistics=["Maximum"],
            dimensions={
                "DomainName": name,
                "ClientId": client_id
            },
            metricName="ClusterStatus.red",
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="Elasticsearch is in status red",
            message="Elasticsearch is in status red",
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(0),
                    reducerType=RTYPE_MAX,
                    operator=OP_OR,
                ),
            ],
            frequency="2m",
            gracePeriod="2m",
            notifications=notifications,
        )

    return Graph(
        title="Status RED alerts",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=True,
        lines=False,
        alert=alert,
    ).auto_ref_ids()
def db_operations_errors(grr_component):
    return Graph(
        title="Database Operations Errors Rate by Call",
        targets=[
            Target(
                expr='sum by (call) (rate(db_request_errors_total{{job="grr_{0}"}}[10m]))'.format(grr_component),
                legendFormat="{{call}}",
            ),
        ])
def db_operations_latency(grr_component):
    return Graph(
        title="Database Operations Latency by Call",
        targets=[
            Target(
                expr='sum by (call) (rate(db_request_latency_sum{{job="grr_{0}"}}[10m]) / rate(db_request_latency_count{{job="grr_{0}"}}[10m]))'.format(grr_component),
                legendFormat="{{call}}",
            ),
        ])
def sum_process_memory_bytes(grr_component):
    return Graph(
        title="Sum of Process Memory Bytes (across all instances)",
        targets=[
            Target(
                expr='sum(process_resident_memory_bytes{{job="grr_{}"}})'.format(grr_component),
                legendFormat="Resident Memory",
            ),
        ])
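# Minimal sketch, assuming the per-component GRR graph builders above are
# meant to be composed into a single grafanalib Dashboard; the dashboard
# title and the row grouping are illustrative, not taken from the source.
from grafanalib.core import Dashboard, Row


def grr_component_dashboard(grr_component):
    return Dashboard(
        title="GRR {} overview (sketch)".format(grr_component),
        rows=[
            Row(panels=[
                number_of_active_processes_graph(grr_component),
                sum_process_memory_bytes(grr_component),
            ]),
            Row(panels=[
                threadpool_outstanding_tasks_vs_threads_num(grr_component),
                db_operations_latency(grr_component),
                db_operations_errors(grr_component),
            ]),
        ],
    ).auto_panel_ids()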
def metastore_failures():
    # Copy counts: sum(rate(flyte:propeller:all:metastore:copy:overall_unlabeled_ms_count[5m]))
    return Graph(
        title=f"Failures from metastore",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate(flyte:propeller:all:metastore:head_failure_unlabeled[5m]))',
                legendFormat="head-failure",
                refId='A',
            ),
            Target(
                expr=f'sum(rate(flyte:propeller:all:metastore:bad_container_unlabeled[5m]))',
                legendFormat="bad-container",
                refId='B',
            ),
            Target(
                expr=f'sum(rate(flyte:propeller:all:metastore:bad_key_unlabeled[5m]))',
                legendFormat="bad-key",
                refId='C',
            ),
            Target(
                expr=f'sum(rate(flyte:propeller:all:metastore:read_failure_unlabeled[5m]))',
                legendFormat="read-failure",
                refId='D',
            ),
            Target(
                expr=f'sum(rate(flyte:propeller:all:metastore:write_failure_unlabeled[5m]))',
                legendFormat="write-failure",
                refId='E',
            ),
        ],
        yAxes=YAxes(
            YAxis(format=OPS_FORMAT),
            YAxis(format=SHORT_FORMAT),
        ),
    )
def resource_stats(collapse: bool) -> Row:
    return Row(
        title="Task stats",
        collapse=collapse,
        panels=[
            Graph(
                title="Pending tasks",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="Memory Usage Percentage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='(max(container_memory_rss{image!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="CPU Usage Percentage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='(sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
        ])
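# Minimal sketch, assuming the Row builders above (errors, wf_store_latency,
# quota_stats, resource_stats) live in one module and are meant to be combined
# into a single grafanalib Dashboard; the dashboard title and the collapse
# choices are illustrative, not taken from the source.
from grafanalib.core import Dashboard


def workflow_overview_dashboard() -> Dashboard:
    return Dashboard(
        title="Flyte workflow overview (sketch)",
        rows=[
            errors(collapse=False),
            wf_store_latency(collapse=True),
            quota_stats(collapse=True),
            resource_stats(collapse=True),
        ],
    ).auto_panel_ids()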
def admin_launcher_cache() -> Graph:
    return Graph(
        title="Admin Launcher cache",
        dataSource=DATASOURCE,
        targets=[
            Target(
                expr=f'sum(rate(flyte:propeller:all:admin_launcher:cache_hit[5m]))',
                legendFormat="hit",
                refId='A',
            ),
            Target(
                expr=f'sum(rate(flyte:propeller:all:admin_launcher:cache_miss[5m]))',
                legendFormat="miss",
                refId='B',
            ),
        ],
        yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
    )
def generate_desired_count_graph(
    name: str,
    cluster_name: str,
    max: int,
    cloudwatch_data_source: str,
    notifications: List[str],
    grid_pos: GridPos,
):
    targets = [
        CloudwatchMetricsTarget(
            alias="Containers",
            namespace=CONTAINER_INSIGHTS_NAMESPACE,
            statistics=["Maximum"],
            metricName="DesiredTaskCount",
            dimensions={
                "ServiceName": name,
                "ClusterName": cluster_name
            },
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None
    if notifications and max > 1:
        alert = Alert(
            name="{} Desired count of containers nearing the max".format(name),
            message="{} is having Desired count of containers nearing the max".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("15m", "now"),
                    evaluator=GreaterThan(0.9 * max),  # 90% of max
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="1m",
            notifications=notifications,
        )

    return Graph(
        title="Desired Tasks",
        dataSource=cloudwatch_data_source,
        targets=targets,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        alert=alert,
        gridPos=grid_pos,
        alertThreshold=ALERT_THRESHOLD,
    ).auto_ref_ids()
def generate_rds_transaction_id_graph(name: str, cloudwatch_data_source: str,
                                      notifications: List[str]):
    """
    Generate RDS transaction ids used graph
    """
    y_axes = single_y_axis(format=SHORT_FORMAT)

    targets = [
        CloudwatchMetricsTarget(
            alias="Transaction ids used",
            metricName="MaximumUsedTransactionIDs",
            statistics=["Maximum"],
            namespace=NAMESPACE,
            dimensions={"DBInstanceIdentifier": name},
            period="1m",
            refId=ALERT_REF_ID,
        ),
    ]

    alert = None
    if notifications:
        alert = Alert(
            name="{} transaction ids used Errors".format(name),
            message="{} is having transaction ids used errors".format(name),
            executionErrorState="alerting",
            alertConditions=[
                AlertCondition(
                    Target(refId=ALERT_REF_ID),
                    timeRange=TimeRange("5m", "now"),
                    evaluator=GreaterThan(1000000000),
                    reducerType=RTYPE_MAX,
                    operator=OP_AND,
                )
            ],
            gracePeriod="2m",
            frequency="2m",
            notifications=notifications,
        )

    return Graph(
        title="Transaction ids used",
        dataSource=cloudwatch_data_source,
        targets=targets,
        yAxes=y_axes,
        transparent=TRANSPARENT,
        editable=EDITABLE,
        bars=False,
        lines=True,
        alert=alert,
    ).auto_ref_ids()
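# Hypothetical usage sketch; the RDS instance name, data source name and
# notification channel below are placeholders, not values from this module.
rds_txid_panel = generate_rds_transaction_id_graph(
    name="prod-postgres-1",
    cloudwatch_data_source="cloudwatch",
    notifications=["ops-slack-channel"],
)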