def export(self, output_file=''):
    """Build a Grafana dashboard from the metrics registry and write it out.

    Timer metrics become SingleStat gauges showing the average duration
    (Prometheus summary ``_sum / _count``); every other metric becomes a
    Graph panel. Panels are grouped four to a row.

    Args:
        output_file: path the dashboard JSON is written to.
    """
    from baskerville.models.metrics.registry import metrics_registry

    panels = []
    # Iterate keys only; the registry values were never used.
    for metric_name in metrics_registry.registry:
        # Flush a completed row of four panels before starting the next.
        # (The original appended BEFORE checking, which emitted an empty
        # leading Row, and its `i == len(registry)` condition was dead
        # because enumerate stops at len - 1.)
        if len(panels) == 4:
            self.rows.append(Row(panels=panels))
            panels = []
        if 'timer' in metric_name:
            g = Gauge()
            # BUG FIX: the original assigned g.maxValue twice; the first
            # assignment was clearly intended to be the lower bound.
            g.minValue = 0
            g.maxValue = 100
            g.show = True
            g.thresholdMarkers = True
            panels.append(
                SingleStat(
                    title=metric_name,
                    dataSource=self.ds,
                    gauge=g,
                    targets=[
                        Target(
                            # Average duration from the summary's sum/count.
                            expr=f'({metric_name}_sum / {metric_name}_count)',
                            target=metric_name,
                            refId='A',
                            metric=metric_name,
                            datasource=self.ds,
                        )
                    ]))
        else:
            panels.append(
                Graph(title=metric_name,
                      dataSource=self.ds,
                      targets=[
                          Target(expr=f'{metric_name}_total'
                                 if 'total' in metric_name else metric_name,
                                 target=metric_name,
                                 refId='A',
                                 metric=metric_name,
                                 datasource=self.ds)
                      ]))
    # Remaining panels (fewer than four) form the final row.
    if panels:
        self.rows.append(Row(panels=panels))
    self.dashboard = Dashboard(title=self.dash_title,
                               rows=self.rows).auto_panel_ids()
    with open(output_file, 'w') as f:
        write_dashboard(self.dashboard, f)
def errors(collapse: bool) -> Row:
    """Row contrasting user-triggered and system-triggered error rates."""
    def _error_rate_graph(title: str, metric: str) -> Graph:
        # 5m rate of the error-duration sample count, summed over series.
        query = (
            'sum(rate(flyte:propeller:all:node:%s_duration_ms_count'
            '{project=~"$project",domain=~"$domain",'
            'wf=~"$project:$domain:$workflow"}[5m]))' % metric)
        return Graph(
            title=title,
            dataSource=DATASOURCE,
            targets=[Target(expr=query, refId='A')],
            yAxes=single_y_axis(format=SHORT_FORMAT),
        )

    return Row(
        title="Error (System vs user)",
        collapse=collapse,
        panels=[
            _error_rate_graph("User errors", "user_error"),
            _error_rate_graph("System errors", "system_error"),
        ],
    )
def row(self, elements=None, **kw):
    """Append a Row built from *elements* and return self for chaining."""
    panels = []
    for item in elements or []:
        # Panels without an explicit data source inherit the builder default.
        item.dataSource = getattr(item, 'dataSource', '') or self.dataSource
        panels.append(item)
    self.rows.append(Row(panels=panels, **kw))
    return self
def create_api_row(api: str, collapse: bool, interval: int = 1) -> Row:
    """One row of admin stats (error codes, error/success split, latency) for *api*."""
    stat_panels = [
        FlyteAdmin.error_codes(api, interval),
        FlyteAdmin.error_vs_success(api, interval),
        FlyteAdmin.api_latency(api, interval),
    ]
    return Row(title=f"{api} stats", collapse=collapse, panels=stat_panels)
def metastore_metrics(interval: int, collapse: bool) -> Row:
    """Row covering metastore cache hit rate and failure panels."""
    metastore_panels = [
        FlytePropeller.metastore_cache_hit_percentage(interval),
        FlytePropeller.metastore_failures(),
    ]
    return Row(
        title="Metastore failures and cache",
        collapse=collapse,
        panels=metastore_panels,
    )
def node_metrics(collapse: bool) -> Row:
    """Row of per-node latency panels (execution, input, event recording)."""
    latency_panels = [
        FlytePropeller.node_exec_latency(),
        FlytePropeller.node_input_latency(),
        FlytePropeller.node_event_recording_latency(),
    ]
    return Row(title="Node Metrics", collapse=collapse, panels=latency_panels)
def wf_store_latency(collapse: bool) -> Row:
    """Row of etcd write metrics for the workflow store.

    Shows workflow-update latency quantiles plus the rates of writes,
    write conflicts and write failures.

    Args:
        collapse: whether the row starts collapsed.
    """
    # NOTE: the original used f-strings with no placeholders (ruff F541);
    # plain string literals are equivalent.
    return Row(
        title="etcD write metrics",
        collapse=collapse,
        panels=[
            Graph(
                title="wf update etcD latency",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        # Latency summary broken out per quantile.
                        expr='sum(flyte:propeller:all:wf_update_latency_ms) by (quantile)',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
            Graph(
                title="etcD writes",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:wf_update_latency_ms_count[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
            Graph(
                title="etcD write conflicts",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:wf_update_conflict[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
            Graph(
                title="etcD write fail",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:wf_update_failed[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=NO_FORMAT),
            ),
        ])
def perf_metrics(collapse: bool) -> Row:
    """Row aggregating event-recording, dynamic-build and cache panels."""
    panels = []
    # These helpers each return a list of panels; flatten them in order.
    for panel_group in (
            FlytePropeller.wf_event_recording(),
            FlytePropeller.node_event_recording(),
            FlytePropeller.task_event_recording(),
            FlytePropeller.dynamic_wf_build(),
    ):
        panels.extend(panel_group)
    # admin_launcher_cache returns a single panel.
    panels.append(FlytePropeller.admin_launcher_cache())
    return Row(title="Perf metrics", collapse=collapse, panels=panels)
def create_entity_db_count(entity: str, collapse: bool, interval: int = 1) -> Row:
    """Row of DB operation-count panels, one per op in FlyteAdmin.DB_OPS."""
    op_panels = [
        FlyteAdmin.db_count(entity, op=op, interval=interval)
        for op in FlyteAdmin.DB_OPS
    ]
    return Row(
        title=f"DB {entity} ops stats",
        collapse=collapse,
        panels=op_panels,
    )
def create_lambda_only_dashboard(
    tags: List[str],
    name: str,
    cloudwatch_data_source: str,
    lambda_insights_namespace: str,
    notifications: List[str],
    environment: str,
    *args,
    **kwargs,
):
    """Build a dashboard containing only the lambda's panels."""
    invocation_row = Row(panels=[
        lambda_generate_invocations_graph(
            name, cloudwatch_data_source, notifications=notifications),
        lambda_generate_duration_graph(name, cloudwatch_data_source),
    ])
    memory_row = Row(panels=[
        lambda_generate_memory_utilization_percentage_graph(
            name,
            cloudwatch_data_source,
            lambda_insights_namespace,
            notifications=notifications,
        ),
        lambda_generate_memory_utilization_graph(
            name, cloudwatch_data_source, lambda_insights_namespace),
    ])
    logs_row = Row(panels=[
        lambda_generate_logs_panel(name, cloudwatch_data_source),
    ])
    return Dashboard(
        title="{}{}".format(LAMBDA_DASHBOARD_PREFIX, name),
        editable=EDITABLE,
        tags=tags + ["lambda", environment],
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=[invocation_row, memory_row, logs_row],
    ).auto_panel_ids()
def quota_stats(collapse: bool) -> Row:
    """Row comparing Kubernetes resource-quota hard limits with usage."""
    def _limits_vs_usage(title: str, resource: str, unit: str) -> Graph:
        def _quota_expr(quota_type: str) -> str:
            return ('kube_resourcequota{resource="%s", '
                    'namespace="$project-$domain", type="%s"}'
                    % (resource, quota_type))
        return Graph(
            title=title,
            dataSource=DATASOURCE,
            targets=[
                Target(
                    expr=_quota_expr("hard"),
                    refId='A',
                    legendFormat="max " + unit,
                ),
                Target(
                    expr=_quota_expr("used"),
                    refId='B',
                    legendFormat="used " + unit,
                ),
            ],
            yAxes=YAxes(
                YAxis(format=OPS_FORMAT),
                YAxis(format=SHORT_FORMAT),
            ),
        )

    return Row(
        title="Kubernetes Quota Usage stats",
        collapse=collapse,
        panels=[
            _limits_vs_usage("CPU Limits vs usage", "limits.cpu", "cpu"),
            _limits_vs_usage("Mem Limits vs usage", "limits.memory", "mem"),
        ],
    )
def core_metrics(interval: int, collapse: bool) -> Row:
    """Core FlytePropeller health panels collected into a single row."""
    core_panels = [
        FlytePropeller.create_free_workers(),
        FlytePropeller.abort_errors(),
        FlytePropeller.system_errors(),
        FlytePropeller.plugin_success_vs_failures(),
        FlytePropeller.round_latency(interval),
        FlytePropeller.round_latency_per_wf(interval),
        FlytePropeller.round_panic(),
        FlytePropeller.workflows_per_project(),
    ]
    return Row(title="Core metrics", collapse=collapse, panels=core_panels)
def create_dashboard(
    title: str,
    datasource_name: str,
    queries: List[RawInfluxDbQuery],
    start: datetime,
    end: datetime,
    timezone: str,
    yaxe_types: List[YAxis],
    thresholds: List[Threshold],
    # BUG FIX: the annotation previously read ``Dict[str, any]`` — the
    # builtin function ``any``, not ``typing.Any``. A string annotation
    # avoids needing ``Any`` in scope at definition time.
    grafana_graph_params: "Dict[str, Any]",
) -> Dashboard:
    """
    Create a dashboard object that can be serialized to JSON and sent to Grafana.

    Args:
        title: dashboard and graph title.
        datasource_name: Grafana data source the targets query.
        queries: InfluxDB queries; ``yaxis == "right"`` queries get a
            series override onto the second axis.
        start, end: absolute time window of the dashboard.
        timezone: dashboard timezone string.
        yaxe_types: one (left) or two (left, right) axis definitions.
        thresholds: thresholds drawn on the graph.
        grafana_graph_params: extra keyword args forwarded to CustomGraph.
    """
    targets = []
    series_overrides = []
    for query in queries:
        targets.append(InfluxDBTarget(query=query.query, alias=query.alias))
        # Route right-axis queries onto yaxis 2 via a series override.
        if query.yaxis == "right":
            series_overrides.append(SeriesOverride(alias=query.alias, yaxis=2))
    left = yaxe_types[0]
    right = yaxe_types[1] if len(yaxe_types) > 1 else None
    yaxes = YAxes(left, right) if right else YAxes(left=left)
    return Dashboard(
        title=title,
        time=Time(start.isoformat(), end.isoformat()),
        timezone=timezone,
        rows=[
            Row(panels=[
                CustomGraph(
                    title=title,
                    dataSource=datasource_name,
                    targets=targets,
                    thresholds=thresholds,
                    seriesOverrides=series_overrides,
                    yAxes=yaxes,
                    transparent=True,
                    **grafana_graph_params,
                ),
            ],
            ),
        ],
    ).auto_panel_ids()
def resource_stats(collapse: bool) -> Row:
    """Row of Kubernetes task-level resource panels.

    Panels: pending (waiting) task pods, memory usage as a fraction of the
    container memory limit, and CPU usage as a fraction of the CPU limit,
    all scoped to the $project-$domain namespace and $workflow variables.

    Args:
        collapse: whether the row starts collapsed.
    """
    return Row(
        title="Task stats",
        collapse=collapse,
        panels=[
            Graph(
                title="Pending tasks",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        # Waiting containers joined (on pod) against pod labels
                        # so series carry execution/task/node/workflow labels;
                        # "> 0" drops series that are not actually waiting.
                        expr=
                        'sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="Memory Usage Percentage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        # RSS of running pods divided by their memory limit;
                        # both sides joined onto pod labels and restricted to
                        # phase="Running".
                        # NOTE(review): this yields a 0-1 ratio but the axis
                        # uses SHORT_FORMAT rather than a percent format —
                        # confirm that is intentional.
                        expr=
                        '(max(container_memory_rss{image!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="CPU Usage Percentage",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        # 2m CPU usage rate divided by the CPU limit for
                        # running pods, joined onto pod labels as above.
                        expr=
                        '(sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
        ])
def generate_firehose_dashboard(
    influxdb_data_source: str, environment: str, *args, **kwargs
) -> Dashboard:
    """Build the Firehose dashboard: one templated row repeated per $firehose."""
    firehose_row = Row(
        panels=[generate_firehose_graph(influxdb_data_source=influxdb_data_source)],
        editable=EDITABLE,
        # Grafana repeats this row for each value of the $firehose variable.
        repeat="firehose",
        title="$firehose",
    )
    return Dashboard(
        title="Firehose",
        editable=EDITABLE,
        tags=["firehose", environment],
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=[firehose_row],
    ).auto_panel_ids()
def metastore_latencies(collapse: bool) -> Row:
    """Row of metastore latency panels (copy, write, read-open, head, fetch).

    All panels plot summary quantiles in milliseconds; the per-workflow
    panels additionally break out by the ``wf`` label.

    Args:
        collapse: whether the row starts collapsed.
    """
    # NOTE: the original prefixed several placeholder-free strings with
    # ``f`` (ruff F541); plain literals are equivalent.
    return Row(
        title="Metastore latencies",
        collapse=collapse,
        panels=[
            Graph(
                title="Metastore copy latency",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(flyte:propeller:all:metastore:copy:overall_unlabeled_ms) by (quantile)',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
            Graph(
                title="Metastore write latency by workflow",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(flyte:propeller:all:metastore:write_ms) by (quantile, wf)',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
            Graph(
                title="Metastore read open latency by workflow",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(flyte:propeller:all:metastore:read_open_ms) by (quantile, wf)',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
            Graph(
                title="Metastore head latency by workflow",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(flyte:propeller:all:metastore:head_ms) by (quantile, wf)',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
            Graph(
                title="Metastore fetch latency by workflow",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(flyte:propeller:all:metastore:proto_fetch_ms) by (quantile, wf)',
                        legendFormat="proto-fetch",
                        refId='A',
                    ),
                    Target(
                        expr='sum(flyte:propeller:all:metastore:remote_fetch_ms) by (quantile, wf)',
                        legendFormat="remote-fetch",
                        refId='B',
                    ),
                ],
                yAxes=single_y_axis(format=MILLISECONDS_FORMAT),
            ),
        ]
    )
def generate_elasticsearch_dashboard(
    name: str,
    client_id: str,
    influxdb_data_source: str,
    cloudwatch_data_source: str,
    environment: str,
    notifications: List[str],
    *args,
    **kwargs,
):
    """Generate Elasticsearch dashboard"""
    # Tag by service plus deployment environment.
    tags = ["elasticsearch", environment]
    rows = [
        # Row 1: cluster CPU plus JVM memory pressure (the latter alerts).
        Row(
            panels=[
                generate_elasticsearch_cpu_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                ),
                generate_elasticsearch_jvm_memory_pressure_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
            ],
            editable=EDITABLE,
        ),
        # Row 2: document counts.
        Row(
            panels=[
                generate_elasticsearch_documents_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                )
            ],
            editable=EDITABLE,
        ),
        # Row 3: storage usage (alerts).
        Row(
            panels=[
                generate_elasticsearch_storage_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                )
            ],
            editable=EDITABLE,
        ),
        # Row 4: request throughput.
        Row(
            panels=[
                generate_elasticsearch_requests_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                )
            ],
            editable=EDITABLE,
        ),
        # Row 5: alert panels — red cluster status, node count, blocked
        # writes, automated-snapshot failures.
        Row(
            panels=[
                generate_elasticsearch_status_red_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
                generate_elasticsearch_nodes_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
                generate_elasticsearch_writes_blocked_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
                generate_elasticsearch_automated_snapshot_failure_alert_graph(
                    name=name,
                    client_id=client_id,
                    cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications,
                ),
            ],
            editable=EDITABLE,
        ),
    ]
    # NOTE(review): influxdb_data_source is accepted but never used here —
    # presumably kept for signature parity with sibling generators; confirm.
    return Dashboard(
        title="Elasticsearch: {}".format(name),
        editable=EDITABLE,
        tags=tags,
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=rows,
        links=[DOCUMENTATION_LINK],
        refresh=DEFAULT_REFRESH,
    ).auto_panel_ids()
Row(panels=[ Graph( title="Freeable Memory", dataSource=DATASOURCE, targets=[ Target( expr= 'aws_ec_freeable_memory_average{environment="${environment}", dimension_cache_cluster_id=~"${cluster}.*"}', legendFormat="{{dimension_cache_cluster_id}}", refId='A', ), ], yAxes=YAxes( YAxis(format=BYTES_FORMAT), YAxis(format=SHORT_FORMAT), ), ), Graph( title="Bytes Used for Cache", dataSource=DATASOURCE, targets=[ Target( expr= 'aws_ec_bytes_used_for_cache_average{environment="${environment}", dimension_cache_cluster_id=~"${cluster}.*"}', legendFormat="{{dimension_cache_cluster_id}}", refId='A', ), ], yAxes=YAxes( YAxis(format=BYTES_FORMAT), YAxis(format=SHORT_FORMAT), ), ) ]),
from grafanalib.core import Dashboard, Graph, Row, Target from grr_grafanalib_dashboards.util import add_data_source from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE GRR_COMPONENT = "admin_ui" dashboard = Dashboard( title="{}s Dashboard".format(GRR_COMPONENT).title().replace("_", " "), rows=[ Row(panels=[panel(GRR_COMPONENT) for panel in row]) for row in GENERAL_PANELS ] + [ Row(panels=[ Graph( title="API Method Latency Rate", targets=[ Target( expr= 'rate(api_method_latency_sum[10m]) / rate(api_method_latency_count[10m])', legendFormat="Latency - Method: {{method_name}}", ), ], ), Graph( title="API Calls Count Rate by Status SUCCESS", targets=[ Target( expr= 'sum(rate(api_method_latency_count{status="SUCCESS"}[10m]))', legendFormat="Successful Calls Rate",
GRAFANA_API_URL = 'http://statsd:3000/api/dashboards/db/' target = Target(target='servers.prod-mysql-maindb*.df-data.df_complex-used', datasource='default') panel = Graph( title='mysql-maindb disk consumption', dataSource='default', targets=[target], yAxes=[ YAxis(format=BYTES_FORMAT), YAxis(format=SHORT_FORMAT), ], ) row = Row(panels=[panel]) db = Dashboard( title='Autogenerated MySQL Disk Consumption', rows=[row], time=Time('now-6M', 'now'), ) s = StringIO.StringIO() write_dashboard(db, s) dashboard_json = s.getvalue() print dashboard_json payload = { "dashboard": json.loads(dashboard_json), "overwrite": True
BYTES_FORMAT, OPS_FORMAT, single_y_axis, Target) dashboard = Dashboard( title="Test Resolver dashboard", rows=[ Row(panels=[ Graph( title="gRPC Rate", dataSource='Prometheus', targets=[ Target( expr= 'rate(grpc_server_handled_total{grpc_service="ResolverService"}[1m])', legendFormat="Total-{{pod}}", refId='A', ), Target( expr= 'rate(grpc_server_handled_total{grpc_method="Resolve", grpc_service="ResolverService"}[1m])', legendFormat="Resolve-{{pod}}", refId='B', ) ], xAxis=XAxis(mode="time"), yAxes=single_y_axis(format=OPS_FORMAT, min=None), ), ]), Row(panels=[ Graph( title="gRPC latency", dataSource='Prometheus', targets=[
def generate_rds_dashboard(
    name: str,
    environment: str,
    influxdb_data_source: str,
    cloudwatch_data_source: str,
    engine: str,
    notifications: List[str],
    **kwargs,
):
    """Assemble the RDS monitoring dashboard for instance *name*."""
    # Kwarg bundles: graphs that alert take notifications, the rest do not.
    alerting = dict(cloudwatch_data_source=cloudwatch_data_source,
                    notifications=notifications)
    plain = dict(cloudwatch_data_source=cloudwatch_data_source)
    rows = [
        Row(panels=[
            generate_rds_cpu_graph(name=name, **alerting),
            generate_rds_burst_balance_graph(name=name, **alerting),
        ]),
        Row(panels=[
            generate_rds_database_connections_graph(name=name, **plain),
            generate_rds_freeable_memory_graph(name=name, **plain),
            generate_rds_free_storage_space_graph(name=name, **plain),
        ]),
        Row(panels=[
            generate_rds_disk_latency_graph(name=name, **plain),
            generate_rds_disk_ops_graph(name=name, **plain),
            generate_rds_network_throughput_graph(name=name, **plain),
        ]),
    ]
    # Transaction-ID monitoring only applies to PostgreSQL engines.
    if engine == "postgres":
        rows.append(Row(panels=[
            generate_rds_transaction_id_graph(name=name, **alerting),
        ]))
    return Dashboard(
        title="RDS: {}".format(name),
        editable=EDITABLE,
        tags=[environment, engine, "rds", "database"],
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=rows,
        links=[
            get_documentation_link(
                "https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/MonitoringOverview.html"
            )
        ],
        refresh=DEFAULT_REFRESH,
    ).auto_panel_ids()
from grafanalib.core import Dashboard, Graph, Row, Target from grr_grafanalib_dashboards.util import add_data_source from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE GRR_COMPONENT = "frontend" dashboard = Dashboard( title="{}s Dashboard".format(GRR_COMPONENT).title(), rows=[ Row(panels=[panel(GRR_COMPONENT) for panel in row]) for row in GENERAL_PANELS ] + [ Row(panels=[ Graph( title="QPS", targets=[ Target( expr='sum(rate(frontend_request_count_total[1m]))', legendFormat="Requests", ), ], ), Graph( title="Request Latency Rate", targets=[ Target( expr= 'sum(rate(frontend_request_latency_sum[10m])) / sum(rate(frontend_request_latency_count[10m]))', legendFormat="Latency", ),
def generate_elasticache_redis_dashboard(
    name: str,
    cache_cluster_id: str,
    influxdb_data_source: str,
    cloudwatch_data_source: str,
    environment: str,
    notifications: List[str],
    *args,
    **kwargs,
):
    """Generate ElastiCache Redis dashboard"""
    # Every panel generator takes the same cluster/data-source pair.
    common = dict(
        cache_cluster_id=cache_cluster_id,
        cloudwatch_data_source=cloudwatch_data_source,
    )
    rows = [
        # CPU, CPU credits (alerts) and swap/memory usage.
        Row(
            editable=EDITABLE,
            panels=[
                generate_elasticache_redis_cpu_usage_graph(**common),
                generate_elasticache_redis_cpu_credit_usage_graph(
                    notifications=notifications, **common),
                generate_elasticache_redis_swap_and_memory_usage_graph(**common),
            ],
        ),
        # Network in, connections and DB memory/evictions.
        Row(
            editable=EDITABLE,
            panels=[
                generate_elasticache_redis_network_in_graph(**common),
                generate_elasticache_redis_connections_graph(**common),
                generate_elasticache_redis_db_memory_usage_and_evicitons_graph(
                    **common),
            ],
        ),
        # Network out, replication and latency.
        Row(
            editable=EDITABLE,
            panels=[
                generate_elasticache_redis_network_out_graph(**common),
                generate_elasticache_redis_replication_graph(**common),
                generate_elasticache_redis_latency_graph(**common),
            ],
        ),
    ]
    return Dashboard(
        title="ElastiCache Redis: {}".format(name),
        editable=EDITABLE,
        tags=["elasticache", "redis", environment],
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        rows=rows,
        links=[DOCUMENTATION_LINK],
        refresh=DEFAULT_REFRESH,
    ).auto_panel_ids()
Row(panels=[ Graph( title="Number of Active Processes", targets=[ Target( expr='sum(up{job="fleetspeak"})', legendFormat="Active Processes", ), ], alert=Alert( name="Number of Active Processes alert", message= "The number of active Fleetspeak Server processes is below {}" .format(ACTIVE_PROCESSES_ALERTING_CONDITION), alertConditions=[ AlertCondition( Target( expr='sum(up{job="fleetspeak"})', legendFormat="Active Processes", ), timeRange=TimeRange("10s", "now"), evaluator=LowerThan( ACTIVE_PROCESSES_ALERTING_CONDITION), operator=OP_AND, reducerType=RTYPE_SUM) ], )), Graph( title="Sum of Process Memory Bytes (across all instances)", targets=[ Target( expr= 'sum(process_resident_memory_bytes{job="fleetspeak"})', legendFormat="Resident Memory", ), ]), Graph( title="CPU Usage", targets=[ Target( expr= 'avg(rate(process_cpu_seconds_total{job="fleetspeak"}[30s])) * 100', legendFormat="Average Process CPU Usage", ), ], yAxes=YAxes(left=YAxis(max=105, format="percent")), ), ]),
from grafanalib.core import Dashboard, Graph, Row, Target, YAxes, YAxis, SECONDS_FORMAT, OPS_FORMAT from grr_grafanalib_dashboards.util import add_data_source from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE GRR_COMPONENT = "admin_ui" dashboard = Dashboard( title="{}s Dashboard".format(GRR_COMPONENT).title().replace("_", " "), rows=[ Row(panels=[panel(GRR_COMPONENT) for panel in row]) for row in GENERAL_PANELS ] + [ Row(panels=[ Graph( title="API Method Latency Rate by Method", targets=[ Target( expr= 'sum by (method_name) (rate(api_method_latency_sum[10m]) / rate(api_method_latency_count[10m]))', legendFormat="{{method_name}}", ), ], yAxes=YAxes(left=YAxis(format=SECONDS_FORMAT)), ), Graph( title="API Calls Count Rate with Status SUCCESS", targets=[ Target( expr= 'sum(rate(api_method_latency_count{status="SUCCESS"}[10m]))',
from grafanalib.core import Dashboard, Graph, Row, Target, YAxes, YAxis, SECONDS_FORMAT from grr_grafanalib_dashboards.util import add_data_source from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE GRR_COMPONENT = "frontend" dashboard = Dashboard( title="{}s Dashboard".format(GRR_COMPONENT).title(), rows=[ Row(panels=[panel(GRR_COMPONENT) for panel in row]) for row in GENERAL_PANELS ] + [ Row(panels=[ Graph( title="QPS", targets=[ Target( expr='sum(rate(frontend_request_count_total[1m]))', legendFormat="Requests", ), ], yAxes=YAxes(left=YAxis(format="reqps")), ), Graph( title="Request Latency", targets=[ Target( expr= 'sum(rate(frontend_request_latency_sum[10m])) / sum(rate(frontend_request_latency_count[10m]))', legendFormat="Latency",
def queue_metrics(collapse: bool) -> Row:
    """Row of FlytePropeller work-queue panels.

    Covers add/depth/retry rates on the main queue, unfinished work,
    average work duration and average time-in-queue.

    Args:
        collapse: whether the row starts collapsed.
    """
    # NOTE: the original used f-strings with no placeholders (ruff F541);
    # plain literals are equivalent.
    return Row(
        title="FlytePropeller Queue metrics",
        collapse=collapse,
        panels=[
            Graph(
                title="WF Adds to main queue",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:main_adds[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="Unprocessed Queue depth",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:main_depth[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="Item retries",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:main_retries[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SHORT_FORMAT),
            ),
            Graph(
                title="Seconds of unfinished work in progress",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='flyte:propeller:all:main_unfinished_work_s',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SECONDS_FORMAT),
            ),
            Graph(
                title="Workqueue work average duration",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        # Average = summary sum / count over 5m windows.
                        # NOTE(review): the metric is in microseconds (_us)
                        # but the axis uses SECONDS_FORMAT — confirm intent.
                        expr='sum(rate(flyte:propeller:all:main_work_duration_us_sum[5m]) / rate(flyte:propeller:all:main_work_duration_us_count[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SECONDS_FORMAT),
            ),
            Graph(
                title="Duration for which an item stays in queue - avg",
                dataSource=DATASOURCE,
                targets=[
                    Target(
                        expr='sum(rate(flyte:propeller:all:main_queue_latency_us_sum[5m]) / rate(flyte:propeller:all:main_queue_latency_us_count[5m]))',
                        refId='A',
                    ),
                ],
                yAxes=single_y_axis(format=SECONDS_FORMAT),
            ),
        ],
    )
def generate_api_gateways_dashboard(
    name: str,
    cloudwatch_data_source: str,
    lambda_insights_namespace: str,
    notifications: List[str],
    environment: str,
    lambdas: List[str],
    *args,
    **kwargs,
):
    """Build the API Gateway dashboard, with metric/log rows per lambda.

    Args:
        name: API gateway name (used for the request graph and title).
        cloudwatch_data_source: Grafana CloudWatch data source.
        lambda_insights_namespace: Lambda Insights metrics namespace.
        notifications: alert notification channels.
        environment: deployment environment, added to the tags.
        lambdas: lambda function names backing this gateway.
    """
    tags = ["api-gateway", environment]
    if lambdas:
        tags = tags + ["lambda"]
    api_gateway_graph = generate_api_gateway_requests_graph(
        name, cloudwatch_data_source, notifications)
    rows = [
        Row(title="API Gateway Metrics",
            showTitle=True,
            panels=[api_gateway_graph])
    ]
    # BUG FIX: the per-lambda panel generators previously received the
    # gateway ``name`` for every iteration, so each "<lambda_fn> Lambda
    # Metrics" row rendered the same gateway-named panels. They now
    # receive ``lambda_fn``, matching how the lambda-only dashboard
    # invokes the same generators.
    for lambda_fn in lambdas:
        lambda_metrics_row = Row(
            title="{} Lambda Metrics".format(lambda_fn),
            showTitle=True,
            collapse=False,
            panels=[
                lambda_generate_invocations_graph(
                    lambda_fn, cloudwatch_data_source, notifications=[]),
                lambda_generate_duration_graph(
                    lambda_fn, cloudwatch_data_source),
                lambda_generate_memory_utilization_percentage_graph(
                    lambda_fn,
                    cloudwatch_data_source,
                    lambda_insights_namespace,
                    notifications=notifications,
                ),
                lambda_generate_memory_utilization_graph(
                    lambda_fn, cloudwatch_data_source,
                    lambda_insights_namespace),
            ],
        )
        lambda_logs_row = Row(
            title="{} Lambda Logs".format(lambda_fn),
            showTitle=True,
            collapse=True,
            panels=[
                lambda_generate_logs_panel(lambda_fn, cloudwatch_data_source),
            ],
        )
        rows.append(lambda_metrics_row)
        rows.append(lambda_logs_row)
    return Dashboard(
        title="{} {}".format("API Gateway:", name),
        editable=EDITABLE,
        tags=tags,
        timezone=TIMEZONE,
        sharedCrosshair=SHARED_CROSSHAIR,
        refresh=DEFAULT_REFRESH,
        rows=rows,
    ).auto_panel_ids()
from grafanalib.core import Dashboard, Graph, Row, Target from grr_grafanalib_dashboards.util import add_data_source from grr_grafanalib_dashboards.reusable_panels import GENERAL_PANELS from grr_grafanalib_dashboards.config import GRAFANA_DATA_SOURCE GRR_COMPONENT = "worker" dashboard = Dashboard( title="{}s Dashboard".format(GRR_COMPONENT).title(), rows=[ Row(panels=[panel(GRR_COMPONENT) for panel in row]) for row in GENERAL_PANELS ] + [ Row(panels=[ Graph( title="Successful Flows Rate vs. Failed Flows Rate", targets=[ Target( expr= 'sum(rate(flow_completions_total{job="grr_worker"}[10m]))', legendFormat="Successes", ), Target( expr= 'sum(rate(flow_errors_total{job="grr_worker"}[10m]))', legendFormat="Failures", ), ], ), Graph( title="Threadpool Latency Rate vs. Queuing Time Rate",