Пример #1
0
def standard_queries(namespace, cpu_lim=50, mem_lim=64):
    """Build the baseline queries meant to be run against all tests.

    Args:
        namespace: Interpolated into every query description and into the
            namespace regex matcher (``=~``) of every query expression.
        cpu_lim: Value the Envoy CPU alarm predicate compares against.
        mem_lim: Value the Envoy memory alarm predicate compares against.

    Returns:
        A list of three Query objects, in order: 5xx request rate,
        Envoy CPU, Envoy memory.
    """
    # Any 5xx response reported by the destination trips this predicate.
    server_errors = Query(
        '%s: 5xx Requests/s' % namespace,
        'sum(rate(istio_requests_total{reporter="destination", destination_service_namespace=~"%s", response_code=~"5.."}[1m]))'
        % namespace,
        Alarm(lambda rate: rate > 0, 'There were 5xx errors.'),
        None,
    )

    # istio-proxy CPU usage rate, scaled by the CPU_MILLI constant.
    proxy_cpu = Query(
        '%s: Envoy CPU' % namespace,
        'rate(container_cpu_usage_seconds_total{container_name="istio-proxy", namespace=~"%s"}[1m]) * %f'
        % (namespace, CPU_MILLI),
        Alarm(lambda usage: usage > cpu_lim,
              'Envoy CPU is unexpectedly high.'),
        None,
    )

    # Peak istio-proxy memory over the window, scaled by the MEM_MB constant.
    proxy_memory = Query(
        '%s: Envoy Memory' % namespace,
        'max(max_over_time(container_memory_usage_bytes{container_name="istio-proxy", namespace=~"%s"}[1m])) * %f'
        % (namespace, MEM_MB),
        Alarm(lambda usage: usage > mem_lim,
              'Envoy memory is unexpectedly high.'),
        None,
    )

    # TODO find a way to get average over time, otherwise this will be flaky and miss real issues.
    # Query(
    #     '%s: CDS Convergence' % namespace,
    #     'count(count_values("value", envoy_cluster_manager_cds_version{namespace="%s"}))' % namespace,
    #     Alarm(
    #         lambda activeVersions: activeVersions > 1,
    #         'CDS has multiple versions running'
    #     )
    # ),
    return [server_errors, proxy_cpu, proxy_memory]
Пример #2
0
 def test_pilot(self):
     """Run a single query on the pilot_total_xds_rejects metric.

     The alarm predicate is ``> 0``, i.e. any rejected XDS push is flagged.
     """
     xds_rejections = Query(
         "Pilot: XDS rejections",
         'pilot_total_xds_rejects',
         Alarm(lambda rejected: rejected > 0,
               'There should not be any rejected XDS pushes'),
         None,
     )
     self.run_queries([xds_rejections])
Пример #3
0
 def test_graceful_shutdown(self):
     """Run the graceful-shutdown checks: 5xx rate and total request rate.

     Both queries select traffic from source_app "client" to the httpbin
     service in the graceful-shutdown namespace over a 10m window.
     """
     # NOTE(review): Query is called with three arguments here (no fourth
     # positional value) - confirm this matches the Query signature in use.
     dropped_requests = Query(
         'Graceful Shutdown: 5xx Requests/s',
         'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client", response_code=~"5.."}[10m]))',
         Alarm(lambda rate: rate > 0,
               'There were 5xx errors. Requests may be getting dropped.'),
     )
     request_volume = Query(
         'Graceful Shutdown: Total Requests/s',
         'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client"}[10m]))',
         Alarm(lambda rate: rate < 18,
               'Not enough requests sent; expect at least 18. Service may be having issues.'),
     )
     self.run_queries([dropped_requests, request_volume])
Пример #4
0
def stability_query(source, test):
    """Build the error-rate Query for one stability test.

    Args:
        source: Value of the ``source`` label used to select
            stability_outgoing_requests_total series.
        test: Test name; used in the description and as the ``test`` label
            of the stability_test_instances expression passed as the
            fourth Query argument.

    Returns:
        A Query whose expression is the failed-request rate divided by the
        total request rate (both over 5m), with a ``> 0`` alarm predicate.
    """
    total = 'sum(rate(stability_outgoing_requests_total{source="%s"}[5m]))' % source
    failure = 'sum(rate(stability_outgoing_requests_total{source="%s", succeeded="False"}[5m]))' % source

    description = '{}: error rate'.format(test)
    error_ratio = '{}/{}'.format(failure, total)
    no_errors_alarm = Alarm(lambda ratio: ratio > 0,
                            'Error rate too high, expected no errors')
    instances_expr = 'sum(stability_test_instances{test="%s"})' % test

    return Query(description, error_ratio, no_errors_alarm, instances_expr)
Пример #5
0
 def test_redis(self):
     """Run the redis-client error-rate query.

     The expression divides the failed-request rate by the total request
     rate for source "redis-client"; the alarm predicate is ``> 0``.
     """
     redis_error_rate = Query(
         'Redis: error rate',
         'sum(rate(stability_outgoing_requests_total{source="redis-client", succeeded="False"}[5m]))/sum(rate(stability_outgoing_requests_total{source="redis-client"}[5m]))',
         Alarm(lambda ratio: ratio > 0,
               'Error rate too high, expected no errors'),
         'sum(stability_test_instances{test="redis"})',
     )
     self.run_queries([redis_error_rate])
Пример #6
0
def istio_requests_sanity(namespace):
    """Build a sanity-check Query confirming the namespace receives requests.

    Not applicable to tests that do not report requests through Istio.

    Args:
        namespace: Interpolated into the description and used as the exact
            destination_service_namespace label value.

    Returns:
        A Query on the 10m request rate whose alarm predicate is ``< 0.5``.
    """
    description = '%s: Total Requests/s (sanity check)' % namespace
    request_rate = (
        'sum(rate(istio_requests_total{destination_service_namespace="%s"}[10m]))'
        % namespace
    )
    too_quiet = Alarm(
        lambda rate: rate < 0.5,
        'There were no requests, the test is likely not running properly.',
    )
    return Query(description, request_rate, too_quiet, None)
Пример #7
0
 def test_external_traffic(self):
     """Run the request-volume check for the external-traffic fortio server.

     The alarm predicate is ``< 250`` on the 10m request rate.
     """
     # NOTE(review): Query is called with three arguments here (no fourth
     # positional value) - confirm this matches the Query signature in use.
     request_volume = Query(
         'External Traffic: Total requests',
         'sum(rate(istio_requests_total{destination_service="fortio-server.allow-external-traffic-b.svc.cluster.local"}[10m]))',
         Alarm(lambda rate: rate < 250,
               'Not enough requests sent; expect at least 250. Service may be having issues.'),
     )
     # Cross namespace metrics are not recorded
     self.run_queries([request_volume])