def standard_queries(namespace, cpu_lim=50, mem_lim=64):
    """Build the baseline queries that every test namespace is checked with.

    Args:
        namespace: Substituted into the description and the PromQL selector
            of each query. The selectors match it with `=~`, so it is
            treated as a regular expression by Prometheus.
        cpu_lim: Threshold for the Envoy CPU alarm condition (`cpu > cpu_lim`).
        mem_lim: Threshold for the Envoy memory alarm condition
            (`mem > mem_lim`).

    Returns:
        A list of three Query objects, in order: 5xx request rate,
        Envoy CPU and Envoy memory. None of them has a running query.
    """
    # Alarm condition: any non-zero rate of 5xx responses at the destination.
    server_errors = Query(
        '%s: 5xx Requests/s' % namespace,
        'sum(rate(istio_requests_total{reporter="destination", destination_service_namespace=~"%s", response_code=~"5.."}[1m]))' % namespace,
        Alarm(lambda error_rate: error_rate > 0, 'There were 5xx errors.'),
        None)

    # The rate is scaled by CPU_MILLI (presumably to millicores; confirm
    # against the constant's definition) before comparison with cpu_lim.
    proxy_cpu = Query(
        '%s: Envoy CPU' % namespace,
        'rate(container_cpu_usage_seconds_total{container_name="istio-proxy", namespace=~"%s"}[1m]) * %f' % (namespace, CPU_MILLI),
        Alarm(lambda cpu: cpu > cpu_lim, 'Envoy CPU is unexpectedly high.'),
        None)

    # Peak proxy memory over the last minute, scaled by MEM_MB (presumably
    # bytes to megabytes; confirm against the constant's definition).
    proxy_memory = Query(
        '%s: Envoy Memory' % namespace,
        'max(max_over_time(container_memory_usage_bytes{container_name="istio-proxy", namespace=~"%s"}[1m])) * %f' % (namespace, MEM_MB),
        Alarm(lambda mem: mem > mem_lim, 'Envoy memory is unexpectedly high.'),
        None)

    # TODO: find a way to get an average over time; otherwise the check
    # below will be flaky and miss real issues. Kept disabled until then.
    # Query(
    #     '%s: CDS Convergence' % namespace,
    #     'count(count_values("value", envoy_cluster_manager_cds_version{namespace="%s"}))' % namespace,
    #     Alarm(
    #         lambda activeVersions: activeVersions > 1,
    #         'CDS has multiple versions running'
    #     )
    # ),
    return [server_errors, proxy_cpu, proxy_memory]
def test_pilot(self):
    """Run the Pilot check: the alarm condition is any rejected XDS push."""
    # A non-zero value of pilot_total_xds_rejects satisfies the alarm
    # condition; no running query is attached.
    xds_rejections = Query(
        "Pilot: XDS rejections",
        'pilot_total_xds_rejects',
        Alarm(lambda errors: errors > 0, 'There should not be any rejected XDS pushes'),
        None)
    self.run_queries([xds_rejections])
def test_graceful_shutdown(self):
    """Run the graceful-shutdown checks against httpbin.

    Two queries are evaluated over a 10 minute window for traffic from
    the `client` app to `httpbin.graceful-shutdown.svc.cluster.local`:

    * the 5xx request rate, whose alarm condition is any value above 0;
    * the total request rate, whose alarm condition is a value below 18.
    """
    queries = [
        Query(
            'Graceful Shutdown: 5xx Requests/s',
            'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client", response_code=~"5.."}[10m]))',
            Alarm(
                lambda error_rate: error_rate > 0,
                'There were 5xx errors. Requests may be getting dropped.'),
            # No running query. Passed explicitly so the call has the same
            # four-argument shape as the other Query(...) calls in this file.
            None),
        Query(
            'Graceful Shutdown: Total Requests/s',
            'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client"}[10m]))',
            Alarm(
                lambda qps: qps < 18,
                'Not enough requests sent; expect at least 18. Service may be having issues.'),
            # No running query (see above).
            None),
    ]
    self.run_queries(queries)
def stability_query(source, test):
    """Build the error-rate Query for a stability test.

    Args:
        source: Value of the `source` label selecting the client's
            `stability_outgoing_requests_total` series.
        test: Test name; used in the description and as the `test` label
            of the `stability_test_instances` running query.

    Returns:
        A Query whose expression is the 5 minute failed-request rate
        divided by the 5 minute total request rate, with an alarm
        condition of any value above 0.
    """
    failed_rate = 'sum(rate(stability_outgoing_requests_total{source="%s", succeeded="False"}[5m]))' % source
    overall_rate = 'sum(rate(stability_outgoing_requests_total{source="%s"}[5m]))' % source
    no_errors_allowed = Alarm(
        lambda errs: errs > 0,
        'Error rate too high, expected no errors')

    return Query(
        '{}: error rate'.format(test),
        '{}/{}'.format(failed_rate, overall_rate),
        no_errors_allowed,
        'sum(stability_test_instances{test="%s"})' % test)
def test_redis(self):
    """Run the Redis check: the alarm condition is any failed request.

    The expression divides the 5 minute failed-request rate of the
    `redis-client` source by its 5 minute total request rate.
    """
    error_ratio = 'sum(rate(stability_outgoing_requests_total{source="redis-client", succeeded="False"}[5m]))/sum(rate(stability_outgoing_requests_total{source="redis-client"}[5m]))'
    running = 'sum(stability_test_instances{test="redis"})'
    no_errors_allowed = Alarm(
        lambda errs: errs > 0,
        'Error rate too high, expected no errors')

    self.run_queries([
        Query('Redis: error rate', error_ratio, no_errors_allowed, running),
    ])
def istio_requests_sanity(namespace):
    """Build a sanity-check Query on the request rate into a namespace.

    The alarm condition is a 10 minute request rate below 0.5/s. This is
    not suitable for tests that do not report their requests through
    Istio.

    Args:
        namespace: Exact `destination_service_namespace` label value to
            match; also used in the description.

    Returns:
        A Query with no running query attached.
    """
    description = '%s: Total Requests/s (sanity check)' % namespace
    request_rate = 'sum(rate(istio_requests_total{destination_service_namespace="%s"}[10m]))' % namespace
    too_quiet = Alarm(
        lambda qps: qps < 0.5,
        'There were no requests, the test is likely not running properly.')
    return Query(description, request_rate, too_quiet, None)
def test_external_traffic(self):
    """Run the external-traffic check against the fortio server.

    The alarm condition is a 10 minute request rate below 250/s to
    `fortio-server.allow-external-traffic-b.svc.cluster.local`.
    """
    queries = [
        Query(
            'External Traffic: Total requests',
            'sum(rate(istio_requests_total{destination_service="fortio-server.allow-external-traffic-b.svc.cluster.local"}[10m]))',
            Alarm(
                lambda qps: qps < 250,
                'Not enough requests sent; expect at least 250. Service may be having issues.'),
            # No running query. Passed explicitly so the call has the same
            # four-argument shape as the other Query(...) calls in this file.
            None)
        # Cross namespace metrics are not recorded
    ]
    self.run_queries(queries)