def condition():
    """Report whether the scraped metric currently holds the expected value.

    Scrapes ``prom_addr`` and returns True when ``metric_name`` is among the
    exported metric names and at least one sample with exactly that name has
    a value equal to ``n``. Returns False otherwise.
    """
    _, exported_names, samples = fetch_prometheus([prom_addr])
    # Nothing to compare until the metric has been exported at all.
    if metric_name not in exported_names:
        return False
    return any(
        sample.name == metric_name and sample.value == n for sample in samples
    )
def test_case_stats_exist():
    """Return True when every expected node/raylet metric name is exported.

    Scrapes ``prom_addresses`` and checks the exported metric names against
    a fixed list of CPU, memory, disk and network metrics.
    ``ray_raylet_mem_uss`` is only required when ``sys.platform`` is
    ``"linux"``.
    """
    _, metric_names, _ = fetch_prometheus(prom_addresses)

    required = (
        "ray_node_cpu_utilization",
        "ray_node_cpu_count",
        "ray_node_mem_used",
        "ray_node_mem_available",
        "ray_node_mem_total",
        "ray_raylet_cpu",
        "ray_raylet_mem",
        "ray_node_disk_io_read",
        "ray_node_disk_io_write",
        "ray_node_disk_io_read_count",
        "ray_node_disk_io_write_count",
        "ray_node_disk_io_read_speed",
        "ray_node_disk_io_write_speed",
        "ray_node_disk_read_iops",
        "ray_node_disk_write_iops",
        "ray_node_disk_usage",
        "ray_node_disk_free",
        "ray_node_disk_utilization_percentage",
        "ray_node_network_sent",
        "ray_node_network_received",
        "ray_node_network_send_speed",
        "ray_node_network_receive_speed",
    )
    # The USS metric is only checked on Linux.
    if sys.platform == "linux":
        required += ("ray_raylet_mem_uss",)

    return all(name in metric_names for name in required)
def verify_used_object_store_memory(expected_mb):
    """Check the object store memory metrics scraped from ``prom_addr``.

    Returns False while any of the available/used/fallback memory samples is
    still missing. Once all three are present, asserts (in whole MB, i.e.
    bytes / 1024 / 1024 truncated) that available == total - used, where the
    total comes from ``obj_store_memory``, and that used and fallback are
    both 400, then returns True.

    NOTE(review): ``expected_mb`` is never read; the expected sizes are
    hard-coded to 400 below. Confirm against the callers whether the
    parameter was meant to drive these assertions.
    """
    _, _, metric_samples = fetch_prometheus([prom_addr])

    def to_mb(num_bytes):
        return int(num_bytes / 1024 / 1024)

    total_memory = to_mb(obj_store_memory)

    tracked = (
        "ray_object_store_available_memory",
        "ray_object_store_used_memory",
        "ray_object_store_fallback_memory",
    )
    # Keep the last sample seen for each tracked metric name.
    latest = dict.fromkeys(tracked)
    for sample in metric_samples:
        if sample.name in tracked:
            latest[sample.name] = sample

    available_sample, used_sample, fallback_sample = (
        latest[name] for name in tracked
    )
    if not (available_sample and used_sample and fallback_sample):
        return False

    avail_memory = to_mb(available_sample.value)
    used_memory = to_mb(used_sample.value)
    fallback_memory = to_mb(fallback_sample.value)

    assert avail_memory == total_memory - used_memory
    assert used_memory == 400  # 400MB
    assert fallback_memory == 400
    return True
def test_worker_stats():
    """Verify that the per-worker CPU/memory metrics are exported.

    Scrapes ``prom_addresses`` and requires ``ray_workers_cpu`` and
    ``ray_workers_mem``, plus ``ray_workers_mem_uss`` when ``sys.platform``
    is ``"linux"``.

    Returns:
        True when every expected metric name is present.

    Raises:
        RuntimeError: naming the first expected metric that is missing.
    """
    _, metric_names, _ = fetch_prometheus(prom_addresses)

    linux_only = ("ray_workers_mem_uss",) if sys.platform == "linux" else ()
    expected_metrics = ("ray_workers_cpu", "ray_workers_mem") + linux_only

    # First expected metric (in order) that was not exported, if any.
    metric = next(
        (name for name in expected_metrics if name not in metric_names), None
    )
    if metric is not None:
        raise RuntimeError(
            f"Metric {metric} not found in exported metric names"
        )
    return True
def test_case_ip_correct():
    """Check that ``ray_raylet_cpu`` is labelled with the local raylet's pid.

    Despite the name, the comparison is on process ids: the ``pid`` label of
    the first ``ray_raylet_cpu`` sample scraped from ``prom_addresses`` is
    compared, as a string, with the pid of the first raylet process
    registered on the global node. Returns False when no such sample exists.
    """
    _, _, metric_samples = fetch_prometheus(prom_addresses)

    raylet_processes = ray._private.worker._global_node.all_processes[
        ray_constants.PROCESS_TYPE_RAYLET
    ]
    raylet_proc = raylet_processes[0]

    # Pid recorded in the tag of the first raylet CPU sample, if any.
    reported_pid = next(
        (
            sample.labels["pid"]
            for sample in metric_samples
            if sample.name == "ray_raylet_cpu"
        ),
        None,
    )
    return str(raylet_proc.process.pid) == str(reported_pid)
def test_case_stats_exist():
    """Return True when every expected node/raylet metric name is exported.

    Scrapes ``prom_addresses`` and checks the exported metric names against
    a fixed list of CPU, memory, disk-usage and network metrics.
    """
    _, metric_names, _ = fetch_prometheus(prom_addresses)

    required = (
        "ray_node_cpu_utilization",
        "ray_node_cpu_count",
        "ray_node_mem_used",
        "ray_node_mem_available",
        "ray_node_mem_total",
        "ray_raylet_cpu",
        "ray_raylet_mem",
        "ray_node_disk_usage",
        "ray_node_disk_free",
        "ray_node_disk_utilization_percentage",
        "ray_node_network_sent",
        "ray_node_network_received",
        "ray_node_network_send_speed",
        "ray_node_network_receive_speed",
    )
    return all(name in metric_names for name in required)
def verify_metrics_not_collected():
    """Return True when the scrape of ``prom_addresses`` shows no metrics.

    Two conditions must hold:
      * every entry in the scraped components mapping is empty, and
      * none of the names in ``_METRICS + _AUTOSCALER_METRICS`` appears in
        the exported metric names.

    On the first violation a diagnostic line is printed and False is
    returned.
    """
    components_dict, metric_names, _ = fetch_prometheus(prom_addresses)

    # Make sure no component is reported.
    for comp in components_dict.values():
        if len(comp) > 0:
            print(
                f"metrics from a component {comp} exists although it should not."
            )
            return False

    # Make sure metrics are not there.
    for metric in _METRICS + _AUTOSCALER_METRICS:
        if metric in metric_names:
            # The f prefix used to sit inside the quotes, so the literal
            # text "f{metric}" was printed instead of the metric name.
            print(f"{metric} exists although it should not.")
            return False
    return True
def test_cases():
    """Assert that the scraped Prometheus data matches what was recorded.

    Checks, in order: which components report on which nodes, that the
    user-defined metrics and every name in ``_METRICS`` are exported, that
    the counter and histogram samples carry the expected values, and that
    every name in ``_AUTOSCALER_METRICS`` is exported (as a prefix) by
    ``autoscaler_export_addr``. Any mismatch raises ``AssertionError``; a
    missing counter/histogram sample raises ``IndexError``.
    """
    components_dict, metric_names, metric_samples = fetch_prometheus(
        prom_addresses)

    def nodes_reporting(component):
        # One boolean per node: does that node report `component`?
        return [
            component in components
            for components in components_dict.values()
        ]

    def samples_matching(fragment, samples):
        # Samples whose name contains `fragment`, in scrape order.
        return [m for m in samples if fragment in m.name]

    # Raylet should be on every node
    assert all(nodes_reporting("raylet"))
    # GCS server should be on one node
    assert any(nodes_reporting("gcs_server"))
    # Core worker should be on at least one node
    assert any(nodes_reporting("core_worker"))

    # Make sure our user defined metrics exist
    for user_metric in ("test_counter", "test_histogram",
                        "test_driver_counter"):
        assert any(user_metric in full_name for full_name in metric_names)

    # Make sure metrics are recorded.
    for metric in _METRICS:
        assert metric in metric_names, \
            f"metric {metric} not in {metric_names}"

    # Make sure the numeric values are correct
    counter_sample = samples_matching("test_counter", metric_samples)[0]
    assert counter_sample.value == 4.0

    driver_counter_sample = samples_matching(
        "test_driver_counter", metric_samples)[0]
    assert driver_counter_sample.value == 1.0

    histogram_samples = samples_matching("test_histogram", metric_samples)
    buckets = {}
    for bucket_sample in samples_matching("_bucket", histogram_samples):
        buckets[bucket_sample.labels["le"]] = bucket_sample.value
    # We recorded value 1.5 for the histogram. In Prometheus data model
    # the histogram is cumulative. So we expect the count to appear in
    # the 1.6 and +Inf buckets.
    assert buckets == {"0.1": 0.0, "1.6": 1.0, "+Inf": 1.0}

    hist_count = samples_matching("_count", histogram_samples)[0].value
    hist_sum = samples_matching("_sum", histogram_samples)[0].value
    assert hist_count == 1
    assert hist_sum == 1.5

    # Autoscaler metrics
    _, autoscaler_metric_names, _ = fetch_prometheus(
        [autoscaler_export_addr])
    for metric in _AUTOSCALER_METRICS:
        # Metric name should appear with some suffix (_count, _total,
        # etc...) in the list of all names
        assert any(
            name.startswith(metric) for name in autoscaler_metric_names
        ), f"{metric} not in {autoscaler_metric_names}"