示例#1
0
 def condition():
     """Report whether the scraped metrics contain the awaited sample.

     Scrapes ``prom_addr`` through ``fetch_prometheus`` and looks for a
     sample named ``metric_name`` whose value equals ``n``. All three names
     come from the enclosing scope, which is not visible in this snippet.

     Returns:
         True if the metric name is exported and at least one sample with
         that name has the value ``n``; False otherwise.
     """
     _, exported_names, samples = fetch_prometheus([prom_addr])
     # Nothing to inspect until the metric name itself has been exported.
     if metric_name not in exported_names:
         return False
     return any(s.name == metric_name and s.value == n for s in samples)
示例#2
0
 def test_case_stats_exist():
     """Check that every expected node and raylet metric name is exported.

     Scrapes ``prom_addresses`` through ``fetch_prometheus`` and tests each
     required name for membership in the returned metric names.

     Returns:
         True when all required names are present, False otherwise.
     """
     _, exported_names, _ = fetch_prometheus(prom_addresses)
     required = [
         "ray_node_cpu_utilization",
         "ray_node_cpu_count",
         "ray_node_mem_used",
         "ray_node_mem_available",
         "ray_node_mem_total",
         "ray_raylet_cpu",
         "ray_raylet_mem",
         "ray_node_disk_io_read",
         "ray_node_disk_io_write",
         "ray_node_disk_io_read_count",
         "ray_node_disk_io_write_count",
         "ray_node_disk_io_read_speed",
         "ray_node_disk_io_write_speed",
         "ray_node_disk_read_iops",
         "ray_node_disk_write_iops",
         "ray_node_disk_usage",
         "ray_node_disk_free",
         "ray_node_disk_utilization_percentage",
         "ray_node_network_sent",
         "ray_node_network_received",
         "ray_node_network_send_speed",
         "ray_node_network_receive_speed",
     ]
     # The USS memory metric is only required when running on Linux.
     if sys.platform == "linux":
         required.append("ray_raylet_mem_uss")
     return all(name in exported_names for name in required)
示例#3
0
    def verify_used_object_store_memory(expected_mb):
        """Check the exported object-store memory metrics.

        Scrapes ``prom_addr`` through ``fetch_prometheus`` and compares the
        available, used and fallback object-store samples (truncated to
        whole MiB) with the expected values. ``prom_addr`` and
        ``obj_store_memory`` come from the enclosing scope.

        Args:
            expected_mb: Expected value of ``ray_object_store_used_memory``
                in whole MiB.

        Returns:
            False if any of the three samples has not been exported yet,
            True once all assertions hold.

        Raises:
            AssertionError: If an exported value differs from what is
                expected.
        """
        _, _, metric_samples = fetch_prometheus([prom_addr])

        def in_mb(num_bytes):
            # Truncate a byte count to whole MiB.
            return int(num_bytes / 1024 / 1024)

        total_memory = in_mb(obj_store_memory)
        available_memory_sample = None
        used_memory_sample = None
        fallback_memory_sample = None

        # When a metric appears several times, the last sample wins.
        for sample in metric_samples:
            if sample.name == "ray_object_store_available_memory":
                available_memory_sample = sample
            elif sample.name == "ray_object_store_used_memory":
                used_memory_sample = sample
            elif sample.name == "ray_object_store_fallback_memory":
                fallback_memory_sample = sample

        # Explicit None checks: "not exported yet" must not depend on the
        # truthiness of the sample objects themselves.
        if (available_memory_sample is None or used_memory_sample is None
                or fallback_memory_sample is None):
            return False

        avail_memory = in_mb(available_memory_sample.value)
        used_memory = in_mb(used_memory_sample.value)
        fallback_memory = in_mb(fallback_memory_sample.value)

        assert avail_memory == total_memory - used_memory
        # Honor the caller-supplied expectation instead of a fixed 400 MB.
        assert used_memory == expected_mb
        # NOTE(review): the fallback expectation is kept hard-coded at 400 MB
        # as before; confirm whether it should also follow expected_mb.
        assert fallback_memory == 400
        return True
示例#4
0
 def test_worker_stats():
     """Check that the worker CPU and memory metrics are exported.

     Scrapes ``prom_addresses`` through ``fetch_prometheus`` and checks the
     required worker metric names in order.

     Returns:
         True when every required metric name is present.

     Raises:
         RuntimeError: Naming the first required metric that is missing.
     """
     _, exported_names, _ = fetch_prometheus(prom_addresses)
     required = ("ray_workers_cpu", "ray_workers_mem")
     # The USS memory metric is only required when running on Linux.
     if sys.platform == "linux":
         required += ("ray_workers_mem_uss",)
     first_missing = next(
         (name for name in required if name not in exported_names), None
     )
     if first_missing is not None:
         raise RuntimeError(
             f"Metric {first_missing} not found in exported metric names"
         )
     return True
示例#5
0
 def test_case_ip_correct():
     """Check that the exported raylet pid matches the local raylet process.

     Scrapes ``prom_addresses`` through ``fetch_prometheus``, takes the
     ``pid`` label of the first ``ray_raylet_cpu`` sample, and compares it as
     a string with the pid of the first raylet process registered on the
     global node.

     Returns:
         True if the two pids match; False otherwise, including when no
         ``ray_raylet_cpu`` sample is present.
     """
     _, _, samples = fetch_prometheus(prom_addresses)
     raylet_info = ray._private.worker._global_node.all_processes[
         ray_constants.PROCESS_TYPE_RAYLET
     ][0]
     # The pid is carried as a label (tag) on the raylet CPU sample.
     reported_pid = next(
         (s.labels["pid"] for s in samples if s.name == "ray_raylet_cpu"),
         None,
     )
     return str(raylet_info.process.pid) == str(reported_pid)
示例#6
0
 def test_case_stats_exist():
     """Check that every expected node and raylet metric name is exported.

     Scrapes ``prom_addresses`` through ``fetch_prometheus`` and tests each
     required name for membership in the returned metric names.

     Returns:
         True when all required names are present, False otherwise.
     """
     _, exported_names, _ = fetch_prometheus(prom_addresses)
     required = (
         "ray_node_cpu_utilization",
         "ray_node_cpu_count",
         "ray_node_mem_used",
         "ray_node_mem_available",
         "ray_node_mem_total",
         "ray_raylet_cpu",
         "ray_raylet_mem",
         "ray_node_disk_usage",
         "ray_node_disk_free",
         "ray_node_disk_utilization_percentage",
         "ray_node_network_sent",
         "ray_node_network_received",
         "ray_node_network_send_speed",
         "ray_node_network_receive_speed",
     )
     return all(name in exported_names for name in required)
示例#7
0
    def verify_metrics_not_collected():
        """Check that no component and no known metric is being exported.

        Scrapes ``prom_addresses`` through ``fetch_prometheus``. The check
        fails as soon as any entry of the returned components mapping is
        non-empty, or any name from ``_METRICS`` or ``_AUTOSCALER_METRICS``
        appears among the exported metric names. The reason is printed
        before returning False.

        Returns:
            True when nothing is reported, False otherwise.
        """
        components_dict, metric_names, _ = fetch_prometheus(prom_addresses)
        # Make sure no component is reported.
        for comp in components_dict.values():
            if len(comp) > 0:
                print(
                    f"metrics from a component {comp} exists although it should not."
                )
                return False

        # Make sure metrics are not there.
        for metric in _METRICS + _AUTOSCALER_METRICS:
            if metric in metric_names:
                # The "f" prefix used to sit inside the quotes, so the metric
                # name was never interpolated into the message.
                print(f"{metric} exists although it should not.")
                return False
        return True
示例#8
0
    def test_cases():
        """Validate the metrics scraped from the cluster and the autoscaler.

        Scrapes ``prom_addresses`` and then ``autoscaler_export_addr``
        through ``fetch_prometheus`` and asserts on the reporting components,
        the exported metric names, and the values of the user-defined
        counter and histogram samples.

        Raises:
            AssertionError: If any expectation does not hold.
            IndexError: If an expected counter or histogram sample is absent.
        """
        components_dict, metric_names, metric_samples = fetch_prometheus(
            prom_addresses)

        def reported_on_nodes(component):
            # Yields one boolean per node: does that node report `component`?
            return (component in node_components
                    for node_components in components_dict.values())

        def samples_matching(fragment):
            # All scraped samples whose name contains `fragment`.
            return [s for s in metric_samples if fragment in s.name]

        # The raylet must be reported by every node.
        assert all(reported_on_nodes("raylet"))

        # The GCS server must be reported by at least one node.
        assert any(reported_on_nodes("gcs_server"))

        # A core worker must be reported by at least one node.
        assert any(reported_on_nodes("core_worker"))

        # The user-defined metrics must show up among the exported names.
        for user_metric in ("test_counter", "test_histogram",
                            "test_driver_counter"):
            assert any(user_metric in exported for exported in metric_names)

        # Every built-in metric must be recorded.
        for metric in _METRICS:
            assert metric in metric_names, \
                f"metric {metric} not in {metric_names}"

        # The counter values must match what was recorded.
        assert samples_matching("test_counter")[0].value == 4.0
        assert samples_matching("test_driver_counter")[0].value == 1.0

        histogram_samples = samples_matching("test_histogram")
        bucket_values = {}
        for s in histogram_samples:
            if "_bucket" in s.name:
                bucket_values[s.labels["le"]] = s.value
        # Prometheus histogram buckets are cumulative, so the single
        # observation (sum 1.5, count 1, asserted below) is counted in the
        # "1.6" bucket and in "+Inf", but not in "0.1".
        assert bucket_values == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}
        # Look up both aggregates before asserting on either of them.
        observed_count = [
            s for s in histogram_samples if "_count" in s.name
        ][0].value
        observed_sum = [
            s for s in histogram_samples if "_sum" in s.name
        ][0].value
        assert observed_count == 1
        assert observed_sum == 1.5

        # Autoscaler metrics
        _, autoscaler_names, _ = fetch_prometheus([autoscaler_export_addr])
        for prefix in _AUTOSCALER_METRICS:
            # Exported names carry a suffix (_count, _total, etc.), so a
            # prefix match against the list of all names is sufficient.
            assert any(name.startswith(prefix)
                       for name in autoscaler_names), \
                f"{prefix} not in {autoscaler_names}"