def test_delete_objects(object_spilling_config, shutdown_only):
    # Limit our object store to 75 MiB of memory.
    object_spilling_config, temp_folder = object_spilling_config
    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 1,
            "min_spilling_size": 0,
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
        })
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []

    for _ in range(80):
        ref = None
        while ref is None:
            ref = ray.put(arr)
            replay_buffer.append(ref)
    print("-----------------------------------")

    del replay_buffer
    del ref
    wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["redis_address"])


def test_delete_objects_on_worker_failure(object_spilling_config, shutdown_only):
    # Limit our object store to 75 MiB of memory.
    object_spilling_config, temp_folder = object_spilling_config

    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 4,
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": 0,
        },
    )

    arr = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def get_pid(self):
            return os.getpid()

        def create_objects(self):
            for _ in range(80):
                ref = None
                while ref is None:
                    ref = ray.put(arr)
                    self.replay_buffer.append(ref)
                # Pop the most recent object with 60% probability.
                if random.randint(0, 9) < 6:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(200):
                ref = random.choice(self.replay_buffer)
                sample = ray.get(ref, timeout=0)
                assert np.array_equal(sample, arr)

    a = Actor.remote()
    actor_pid = ray.get(a.get_pid.remote())
    ray.get(a.create_objects.remote())
    os.kill(actor_pid, 9)

    def wait_until_actor_dead():
        try:
            ray.get(a.get_pid.remote())
        except ray.exceptions.RayActorError:
            return True
        return False

    wait_for_condition(wait_until_actor_dead)

    # After the actor is killed, make sure all spilled objects are deleted.
    wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["address"])


def test_spill_dir_cleanup_on_raylet_start(object_spilling_config):
    object_spilling_config, temp_folder = object_spilling_config
    cluster = Cluster()
    cluster.add_node(
        num_cpus=0,
        object_store_memory=75 * 1024 * 1024,
        _system_config={"object_spilling_config": object_spilling_config},
    )
    ray.init(address=cluster.address)
    node2 = cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # This task will run on node 2 because node 1 has no CPU resource.
    @ray.remote(num_cpus=1)
    def run_workload():
        ids = []
        for _ in range(2):
            arr = np.random.rand(5 * 1024 * 1024)  # 40 MB
            ids.append(ray.put(arr))
        return ids

    # Hold the object refs so the spilled objects are not deleted prematurely.
    ids = ray.get(run_workload.remote())
    assert not is_dir_empty(temp_folder)

    # Kill node 2.
    cluster.remove_node(node2)

    # Verify that the spill folder is not empty.
    assert not is_dir_empty(temp_folder)

    # Start a new node.
    cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

    # Verify that the spill folder is now cleaned up.
    assert is_dir_empty(temp_folder)

    del ids
    ray.shutdown()
    cluster.shutdown()


def test_file_deleted_when_driver_exits(tmp_path, shutdown_only):
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()

    driver = """
import json
import os
import signal
import numpy as np
import ray

ray.init(
    object_store_memory=75 * 1024 * 1024,
    _system_config={{
        "max_io_workers": 2,
        "min_spilling_size": 0,
        "automatic_object_spilling_enabled": True,
        "object_store_full_delay_ms": 100,
        "object_spilling_config": json.dumps({{
            "type": "filesystem",
            "params": {{
                "directory_path": "{temp_dir}"
            }}
        }}),
    }})
arr = np.random.rand(1024 * 1024)  # 8 MB data
replay_buffer = []
# Spill lots of objects.
for _ in range(30):
    ref = None
    while ref is None:
        ref = ray.put(arr)
        replay_buffer.append(ref)
# Send the requested signal to itself.
signum = {signum}
sig = None
if signum == 2:
    sig = signal.SIGINT
elif signum == 15:
    sig = signal.SIGTERM
os.kill(os.getpid(), sig)
"""

    # Run a driver that kills itself with SIGINT.
    print("Sending sigint...")
    with pytest.raises(subprocess.CalledProcessError):
        print(
            run_string_as_driver(
                driver.format(temp_dir=str(temp_folder), signum=2)))
    wait_for_condition(lambda: is_dir_empty(temp_folder, append_path=""))


def test_delete_objects_delete_while_creating(object_spilling_config, shutdown_only):
    # Limit our object store to 75 MiB of memory.
    object_spilling_config, temp_folder = object_spilling_config
    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 4,
            "min_spilling_size": 0,
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
        },
    )
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []

    for _ in range(80):
        ref = None
        while ref is None:
            ref = ray.put(arr)
            replay_buffer.append(ref)
        # Pop the most recent object with 60% probability.
        if random.randint(0, 9) < 6:
            replay_buffer.pop()

    # Do random sampling.
    for _ in range(200):
        ref = random.choice(replay_buffer)
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, arr)

    # Finally, make sure all objects are deleted without a race condition.
    del replay_buffer
    del ref
    wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["address"])


def test_delete_objects_multi_node(multi_node_object_spilling_config,
                                   ray_start_cluster):
    # Limit our object store to 75 MiB of memory.
    object_spilling_config, temp_folder = multi_node_object_spilling_config
    cluster = ray_start_cluster
    # Head node.
    cluster.add_node(
        num_cpus=1,
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 2,
            "min_spilling_size": 20 * 1024 * 1024,
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
        })
    ray.init(address=cluster.address)
    # Add 2 worker nodes.
    for _ in range(2):
        cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
    cluster.wait_for_nodes()

    arr = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def ping(self):
            return

        def create_objects(self):
            for _ in range(80):
                ref = None
                while ref is None:
                    ref = ray.put(arr)
                    self.replay_buffer.append(ref)
                # Pop the most recent object with 60% probability.
                if random.randint(0, 9) < 6:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(50):
                ref = random.choice(self.replay_buffer)
                sample = ray.get(ref, timeout=10)
                assert np.array_equal(sample, arr)

    actors = [Actor.remote() for _ in range(3)]
    ray.get([actor.create_objects.remote() for actor in actors])

    def wait_until_actor_dead(actor):
        try:
            ray.get(actor.ping.remote())
        except ray.exceptions.RayActorError:
            return True
        return False

    # Kill actors to remove all references.
    for actor in actors:
        ray.kill(actor)
        wait_for_condition(lambda: wait_until_actor_dead(actor))
    # The multi-node deletion should work.
    wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(cluster.address)


def test_multiple_directories(tmp_path, shutdown_only):
    num_dirs = 3
    temp_dirs = []
    for i in range(num_dirs):
        temp_folder = tmp_path / f"spill_{i}"
        temp_folder.mkdir()
        temp_dirs.append(temp_folder)

    # Limit our object store to 75 MiB of memory.
    min_spilling_size = 0
    object_spilling_config = json.dumps({
        "type": "filesystem",
        "params": {
            "directory_path": [str(directory) for directory in temp_dirs]
        }
    })
    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 5,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": min_spilling_size,
        })

    arr = np.ones(74 * 1024 * 1024, dtype=np.uint8)  # 74 MB.
    object_refs = []
    # Now the storage is full.
    object_refs.append(ray.put(arr))

    num_object_spilled = 20
    for _ in range(num_object_spilled):
        object_refs.append(ray.put(arr))

    num_files = defaultdict(int)
    for temp_dir in temp_dirs:
        temp_folder = temp_dir / ray.ray_constants.DEFAULT_OBJECT_PREFIX
        for path in temp_folder.iterdir():
            num_files[str(temp_folder)] += 1

    for ref in object_refs:
        assert np.array_equal(ray.get(ref), arr)

    print("Check distribution...")
    min_count = 5
    is_distributed = [n_files >= min_count for n_files in num_files.values()]
    assert all(is_distributed)

    print("Check deletion...")
    # Empty object refs.
    object_refs = []
    # Add a new object so that the last entry is evicted.
    ref = ray.put(arr)
    for temp_dir in temp_dirs:
        temp_folder = temp_dir
        wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["redis_address"])

    # Now kill ray and see all directories are deleted.
    print("Check directories are deleted...")
    ray.shutdown()
    for temp_dir in temp_dirs:
        wait_for_condition(lambda: is_dir_empty(temp_dir, append_path=""))
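

# For reference: `is_dir_empty` and `assert_no_thrashing` are helpers defined
# elsewhere in this test module. Below is a minimal, hypothetical sketch of
# what `is_dir_empty` could look like, assuming the filesystem spiller writes
# objects under a ray.ray_constants.DEFAULT_OBJECT_PREFIX subdirectory (as the
# directory walk in test_multiple_directories suggests). The real helper may
# differ; this sketch is only for orientation and is deliberately given a
# different name so it does not shadow the actual implementation.
def is_dir_empty_sketch(temp_folder,
                        append_path=ray.ray_constants.DEFAULT_OBJECT_PREFIX):
    # The spill subdirectory may not exist yet if nothing has been spilled,
    # in which case the folder counts as empty.
    spill_dir = temp_folder / append_path
    if not spill_dir.exists():
        return True
    # Otherwise, the folder is empty iff it contains no spilled files.
    return not any(spill_dir.iterdir())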