d.Graph( title="coredns: # running", targets=[ d.Target( expr= 'count(container_memory_usage_bytes{namespace="kube-system", container="coredns"}) by (container, namespace)' ) ], nullPointMode="null", ), d.Graph( title="coredns: memory usage", targets=d.min_max_avg( base= 'process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}', by=["job", "namespace"], legend="{{job}}", ), nullPointMode="null", yAxes=g.single_y_axis(format=g.BYTES_FORMAT), ), ] dashboard = d.Dashboard( title="DNS", rows=[ d.Row(title="In-cluster DNS latency", panels=DNS_LATENCY_PANEL), d.Row(title="CoreDNS", panels=COREDNS_PANELS), ], ).auto_panel_ids()
], yAxes=g.single_y_axis(format=g.SHORT_FORMAT, logBase=10), ), ] # The final dashboard must be named 'dashboard' so that grafanalib will find it. dashboard = d.Dashboard( title="Master dashboard", refresh="", rows=[ d.Row(title="Clusterloader", panels=CLUSTERLOADER_PANELS), d.Row(title="Overall cluster health", panels=HEALTH_PANELS, collapse=True), d.Row(title="etcd", panels=ETCD_PANELS, collapse=True), d.Row(title="kube-apiserver", panels=APISERVER_PANELS, collapse=True), d.Row( title="kube-controller-manager", panels=[ d.simple_graph( "Workqueue depths", 'workqueue_depth{endpoint="kube-controller-manager"}', legend="{{name}}", ) ], collapse=True, ), d.Row(title="Master VM", panels=VM_PANELS, collapse=True), ], ).auto_panel_ids()
d.Graph( title="Service: # running", targets=[ d.Target( expr= 'count(process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}) by (job, namespace)' ) ], nullPointMode="null", ), d.Graph( title="Service: memory usage", targets=d.min_max_avg( base= 'process_resident_memory_bytes{namespace="kube-system", job="kube-dns"}', by=["job", "namespace"], legend="{{job}}", ), nullPointMode="null", yAxes=g.single_y_axis(format=g.BYTES_FORMAT), ), ] dashboard = d.Dashboard( title="DNS", rows=[ d.Row(title="In-cluster DNS prober", panels=PROBER_PANEL), d.Row(title="In-cluster DNS service", panels=SERVICE_PANELS), ], ).auto_panel_ids()
# "Master dashboard": every row is a plain (title, panels) pair. The final
# "Addons" row holds a single graph with four targets that differ only in
# the quantile argument and the legend text.
dashboard = d.Dashboard(
    title="Master dashboard",
    rows=[
        d.Row(title=row_title, panels=row_panels)
        for row_title, row_panels in (
            ("Clusterloader", CLUSTERLOADER_PANELS),
            ("Overall cluster health", HEALTH_PANELS),
            ("etcd", ETCD_PANELS),
            ("kube-apiserver", APISERVER_PANELS),
            (
                "kube-controller-manager",
                [
                    d.simple_graph(
                        "Workqueue depths",
                        'workqueue_depth{endpoint="kube-controller-manager"}',
                        legend="{{name}}",
                    )
                ],
            ),
            ("Master VM", VM_PANELS),
            (
                "Addons",
                [
                    d.Graph(
                        title="Coredns memory",
                        dataSource="$source",
                        targets=[
                            g.Target(
                                # %-formatting is used because the PromQL
                                # text itself contains literal braces.
                                expr='quantile(%s, sum(process_resident_memory_bytes{job="kube-dns"}) by (pod))'
                                % quantile,
                                legendFormat=legend,
                            )
                            # Quantiles are kept as strings so the rendered
                            # expressions keep their exact spelling
                            # (e.g. "0.90", not "0.9").
                            # NOTE(review): only the first legend ends in
                            # "pctl"; the others end in "ctl". Kept as-is.
                            for quantile, legend in (
                                ("1", "coredns-mem-100pctl"),
                                ("0.99", "coredns-mem-99ctl"),
                                ("0.90", "coredns-mem-90ctl"),
                                ("0.50", "coredns-mem-50ctl"),
                            )
                        ],
                        yAxes=g.single_y_axis(format=g.BYTES_FORMAT),
                    )
                ],
            ),
        )
    ],
).auto_panel_ids()
# "Comparison Master dashboard": every row's panels are passed through
# extended_copy(). The first row is built without collapse; all others are
# built with collapse=True.
dashboard = d.Dashboard(
    title="Comparison Master dashboard",
    refresh="",
    rows=[
        d.Row(
            title="API call latency",
            panels=extended_copy(API_CALL_LATENCY_PANELS),
        ),
    ] + [
        d.Row(title=row_title, panels=extended_copy(row_panels), collapse=True)
        for row_title, row_panels in (
            (
                "API call latency aggregated with quantile",
                QUANTILE_API_CALL_LATENCY_PANELS,
            ),
            ("P&F metrics", PAF_PANELS),
            ("Overall cluster health", HEALTH_PANELS),
            ("etcd", ETCD_PANELS),
            ("kube-apiserver", APISERVER_PANELS),
            ("kube-controller-manager", CONTROLLER_MANAGER_PANELS),
            ("Master VM", VM_PANELS),
        )
    ],
    templating=g.Templating(
        list=[
            d.SOURCE_TEMPLATE,
            g.Template(
                name="secondary_source",
                type="datasource",
                query="prometheus",
            ),
            g.Template(
                name="timeshift",
                type="interval",
                query="",
            ),
        ] + [
            # The query-type variables share every setting except their
            # name, query and (for two of them) a regex.
            g.Template(
                name=var_name,
                type="query",
                dataSource="$source",
                query=label_query,
                multi=True,
                includeAll=True,
                refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
                **extra_kwargs
            )
            for var_name, label_query, extra_kwargs in (
                (
                    "etcd_type",
                    "label_values(etcd_request_duration_seconds_count, type)",
                    {"regex": r"\*\[+\]+(.*)"},
                ),
                (
                    "etcd_operation",
                    "label_values(etcd_request_duration_seconds_count, operation)",
                    {},
                ),
                (
                    "verb",
                    "label_values(apiserver_request_duration_seconds_count, verb)",
                    {},
                ),
                (
                    "resource",
                    "label_values(apiserver_request_duration_seconds_count, resource)",
                    {"regex": "(.*)s"},
                ),
            )
        ]
    ),
).auto_panel_ids()
# "Master dashboard": the first row is built without collapse, every other
# row with collapse=True. Templating consists of d.SOURCE_TEMPLATE followed
# by four query variables that all read from "$source".
dashboard = d.Dashboard(
    title="Master dashboard",
    refresh="",
    rows=[
        d.Row(title="API call latency", panels=API_CALL_LATENCY_PANELS),
    ] + [
        d.Row(title=row_title, panels=row_panels, collapse=True)
        for row_title, row_panels in (
            (
                "API call latency aggregated with quantile",
                QUANTILE_API_CALL_LATENCY_PANELS,
            ),
            ("Overall cluster health", HEALTH_PANELS),
            ("etcd", ETCD_PANELS),
            ("kube-apiserver", APISERVER_PANELS),
            (
                "kube-controller-manager",
                [
                    d.simple_graph(
                        "Workqueue depths",
                        'workqueue_depth{endpoint="kube-controller-manager"}',
                        legend="{{name}}",
                    )
                ],
            ),
            ("Master VM", VM_PANELS),
        )
    ],
    templating=g.Templating(
        list=[d.SOURCE_TEMPLATE] + [
            g.Template(
                name=var_name,
                type="query",
                dataSource="$source",
                query=label_query,
                multi=True,
                includeAll=True,
                refresh=g.REFRESH_ON_TIME_RANGE_CHANGE,
                # The regex keyword is only passed for variables that
                # define one.
                **({} if value_regex is None else {"regex": value_regex})
            )
            for var_name, label_query, value_regex in (
                (
                    "etcd_type",
                    "label_values(etcd_request_duration_seconds_count, type)",
                    r"\*\[+\]+(.*)",
                ),
                (
                    "etcd_operation",
                    "label_values(etcd_request_duration_seconds_count, operation)",
                    None,
                ),
                (
                    "verb",
                    "label_values(apiserver_request_duration_seconds_count, verb)",
                    None,
                ),
                (
                    "resource",
                    "label_values(apiserver_request_duration_seconds_count, resource)",
                    "(.*)s",
                ),
            )
        ]
    ),
).auto_panel_ids()
api_call_latency( title="Read-only API call latency (scope=cluster, threshold=30s)", metric=metric, verb="LIST", scope="cluster", threshold=30, ), api_call_latency( title="Mutating API call latency (threshold=1s)", metric=metric, verb=d.any_of("CREATE", "DELETE", "PATCH", "POST", "PUT"), scope=d.any_of("namespace", "cluster"), threshold=1, ), ] # The final dashboard must be named 'dashboard' so that grafanalib will find it. dashboard = d.Dashboard( title="SLO", rows=[ d.Row(title="SLO", panels=create_slo_panel()), d.Row( title="Experimental: SLO (window 1m)", panels=create_slo_panel( metric="apiserver:apiserver_request_latency_1m:histogram_quantile" ), ), ], ).auto_panel_ids()
nullPointMode="null", ), d.Graph( title="probes: memory usage", targets=[ d.Target( expr='min(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="min {{container}}", ), d.Target( expr='avg(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="avg {{container}}", ), d.Target( expr='max(container_memory_usage_bytes{namespace="probes", container=~"ping-client|ping-server"}) by (container)', legendFormat="max {{container}}", ), ], nullPointMode="null", ), ] dashboard = d.Dashboard( title="Network", rows=[ d.Row(title="Network progamming latency", panels=NETWORK_PROGRAMMING_PANEL), d.Row(title="In-cluster network latency", panels=NETWORK_LATENCY_PANEL), ], ).auto_panel_ids()