def test_delete_objects(object_spilling_config, shutdown_only):
    """Check that spilled files are removed once all references are dropped.

    Puts 80 copies of an 8 MB array into a 75 MiB object store with automatic
    spilling enabled, deletes every reference, and then waits until the spill
    directory is empty.
    """
    spill_config, spill_dir = object_spilling_config

    # Limit our object store to 75 MiB of memory.
    system_config = {
        "max_io_workers": 1,
        "min_spilling_size": 0,
        "automatic_object_spilling_enabled": True,
        "object_store_full_delay_ms": 100,
        "object_spilling_config": spill_config,
    }
    address = ray.init(
        object_store_memory=75 * 1024 * 1024, _system_config=system_config)
    payload = np.random.rand(1024 * 1024)  # 8 MB data
    held_refs = []

    for _ in range(80):
        latest_ref = None
        while latest_ref is None:
            latest_ref = ray.put(payload)
            held_refs.append(latest_ref)

    print("-----------------------------------")

    # Drop both the list and the leftover loop variable so that nothing in
    # this driver references the objects any more.
    del held_refs, latest_ref

    def spill_dir_is_empty():
        return is_dir_empty(spill_dir)

    wait_for_condition(spill_dir_is_empty)
    assert_no_thrashing(address["redis_address"])
예제 #2
0
def test_delete_objects_on_worker_failure(object_spilling_config,
                                          shutdown_only):
    """Check that spilled files are removed after their owning actor dies.

    An actor puts many 8 MB arrays into a 75 MiB object store, randomly
    discarding some references, and reads random survivors back. The actor
    process is then killed with signal 9 and the test waits for the spill
    directory to become empty.
    """
    spill_config, spill_dir = object_spilling_config

    # Limit our object store to 75 MiB of memory.
    system_config = {
        "max_io_workers": 4,
        "automatic_object_spilling_enabled": True,
        "object_store_full_delay_ms": 100,
        "object_spilling_config": spill_config,
        "min_spilling_size": 0,
    }
    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config=system_config,
    )

    payload = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def get_pid(self):
            return os.getpid()

        def create_objects(self):
            for _ in range(80):
                new_ref = None
                while new_ref is None:
                    new_ref = ray.put(payload)
                    self.replay_buffer.append(new_ref)
                # Remove the replay buffer with 60% probability.
                drop_latest = random.randint(0, 9) < 6
                if drop_latest:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(200):
                picked = random.choice(self.replay_buffer)
                fetched = ray.get(picked, timeout=0)
                assert np.array_equal(fetched, payload)

    a = Actor.remote()
    actor_pid = ray.get(a.get_pid.remote())
    ray.get(a.create_objects.remote())
    os.kill(actor_pid, 9)

    def actor_has_died():
        # A call on a dead actor raises RayActorError; any successful call
        # means the actor is still alive.
        try:
            ray.get(a.get_pid.remote())
        except ray.exceptions.RayActorError:
            return True
        else:
            return False

    wait_for_condition(actor_has_died)

    # After all, make sure all objects are deleted upon worker failures.
    def spill_dir_is_empty():
        return is_dir_empty(spill_dir)

    wait_for_condition(spill_dir_is_empty)
    assert_no_thrashing(address["address"])
예제 #3
0
def test_spill_dir_cleanup_on_raylet_start(object_spilling_config):
    """Check that a newly started node cleans up a stale spill directory.

    A workload on a second node spills objects, the node is removed while the
    spilled files are still on disk, and a replacement node is added. The
    spill directory must be empty once the replacement node has started.

    The cluster is created by this test itself, so teardown is done in a
    ``finally`` block: a failing assertion must not leave the cluster running.
    """
    object_spilling_config, temp_folder = object_spilling_config
    cluster = Cluster()
    try:
        cluster.add_node(
            num_cpus=0,
            object_store_memory=75 * 1024 * 1024,
            _system_config={"object_spilling_config": object_spilling_config},
        )
        ray.init(address=cluster.address)
        node2 = cluster.add_node(
            num_cpus=1, object_store_memory=75 * 1024 * 1024)

        # This task will run on node 2 because node 1 has no CPU resource
        @ray.remote(num_cpus=1)
        def run_workload():
            ids = []
            for _ in range(2):
                arr = np.random.rand(5 * 1024 * 1024)  # 40 MB
                ids.append(ray.put(arr))
            return ids

        ids = ray.get(run_workload.remote())
        assert not is_dir_empty(temp_folder)

        # Kill node 2
        cluster.remove_node(node2)

        # Verify that the spill folder is not empty
        assert not is_dir_empty(temp_folder)

        # Start a new node
        cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)

        # Verify that the spill folder is now cleaned up
        assert is_dir_empty(temp_folder)

        # The object refs were held up to this point to prevent the objects
        # from being deleted; they can be released now.
        del ids
    finally:
        # Always tear down, even when an assertion above fails.
        ray.shutdown()
        cluster.shutdown()
def test_file_deleted_when_driver_exits(tmp_path, shutdown_only):
    """Check that spilled files are removed when the driver is interrupted.

    A separate driver script spills objects into a directory under
    ``tmp_path`` and then sends itself SIGINT (``signum=2``). Running it is
    expected to raise ``subprocess.CalledProcessError``; afterwards the test
    waits until the spill directory is empty.
    """
    spill_root = tmp_path / "spill"
    spill_root.mkdir()

    # Template for the driver script; ``temp_dir`` and ``signum`` are filled
    # in with ``str.format``, hence the doubled braces around literal dicts.
    driver_template = """
import json
import os
import signal
import numpy as np
import ray
ray.init(
    object_store_memory=75 * 1024 * 1024,
    _system_config={{
        "max_io_workers": 2,
        "min_spilling_size": 0,
        "automatic_object_spilling_enabled": True,
        "object_store_full_delay_ms": 100,
        "object_spilling_config": json.dumps({{
            "type": "filesystem",
            "params": {{
                "directory_path": "{temp_dir}"
            }}
        }}),
    }})
arr = np.random.rand(1024 * 1024)  # 8 MB data
replay_buffer = []
# Spill lots of objects
for _ in range(30):
    ref = None
    while ref is None:
        ref = ray.put(arr)
        replay_buffer.append(ref)
# Send sigterm to itself.
signum = {signum}
sig = None
if signum == 2:
    sig = signal.SIGINT
elif signum == 15:
    sig = signal.SIGTERM
os.kill(os.getpid(), sig)
"""

    # Run a driver with sigint.
    print("Sending sigint...")
    with pytest.raises(subprocess.CalledProcessError):
        script = driver_template.format(temp_dir=str(spill_root), signum=2)
        driver_output = run_string_as_driver(script)
        print(driver_output)

    def spill_root_is_empty():
        return is_dir_empty(spill_root, append_path="")

    wait_for_condition(spill_root_is_empty)
예제 #5
0
def test_delete_objects_delete_while_creating(object_spilling_config,
                                              shutdown_only):
    """Check spill-file deletion when refs are dropped while objects are created.

    While putting 8 MB arrays into a 75 MiB object store, the newest reference
    is randomly discarded so that deletions interleave with creation. The
    surviving objects are read back, all references are dropped, and the test
    waits for the spill directory to become empty.
    """
    spill_config, spill_dir = object_spilling_config

    # Limit our object store to 75 MiB of memory.
    system_config = {
        "max_io_workers": 4,
        "min_spilling_size": 0,
        "automatic_object_spilling_enabled": True,
        "object_store_full_delay_ms": 100,
        "object_spilling_config": spill_config,
    }
    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config=system_config,
    )
    payload = np.random.rand(1024 * 1024)  # 8 MB data
    held_refs = []

    for _ in range(80):
        current_ref = None
        while current_ref is None:
            current_ref = ray.put(payload)
            held_refs.append(current_ref)
        # Remove the replay buffer with 60% probability.
        drop_latest = random.randint(0, 9) < 6
        if drop_latest:
            held_refs.pop()

    # Do random sampling.
    for _ in range(200):
        current_ref = random.choice(held_refs)
        fetched = ray.get(current_ref, timeout=0)
        assert np.array_equal(fetched, payload)

    # After all, make sure all objects are killed without race condition.
    del held_refs, current_ref

    def spill_dir_is_empty():
        return is_dir_empty(spill_dir)

    wait_for_condition(spill_dir_is_empty)
    assert_no_thrashing(address["address"])
def test_delete_objects_multi_node(multi_node_object_spilling_config,
                                   ray_start_cluster):
    """Check that spilled files are removed in a three-node cluster.

    One actor per node creates many 8 MB objects in a 75 MiB object store,
    randomly discards some references, and reads random survivors back. All
    actors are then killed and the test waits for the spill directory to
    become empty.
    """
    spill_config, spill_dir = multi_node_object_spilling_config
    # Limit our object store to 75 MiB of memory.
    node_store_bytes = 75 * 1024 * 1024

    cluster = ray_start_cluster
    # Head node.
    head_system_config = {
        "max_io_workers": 2,
        "min_spilling_size": 20 * 1024 * 1024,
        "automatic_object_spilling_enabled": True,
        "object_store_full_delay_ms": 100,
        "object_spilling_config": spill_config,
    }
    cluster.add_node(
        num_cpus=1,
        object_store_memory=node_store_bytes,
        _system_config=head_system_config)
    ray.init(address=cluster.address)
    # Add 2 worker nodes.
    for _ in range(2):
        cluster.add_node(num_cpus=1, object_store_memory=node_store_bytes)
    cluster.wait_for_nodes()

    payload = np.random.rand(1024 * 1024)  # 8 MB data

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.replay_buffer = []

        def ping(self):
            return

        def create_objects(self):
            for _ in range(80):
                new_ref = None
                while new_ref is None:
                    new_ref = ray.put(payload)
                    self.replay_buffer.append(new_ref)
                # Remove the replay buffer with 60% probability.
                drop_latest = random.randint(0, 9) < 6
                if drop_latest:
                    self.replay_buffer.pop()

            # Do random sampling.
            for _ in range(50):
                picked = random.choice(self.replay_buffer)
                fetched = ray.get(picked, timeout=10)
                assert np.array_equal(fetched, payload)

    actors = [Actor.remote() for _ in range(3)]
    pending = [actor.create_objects.remote() for actor in actors]
    ray.get(pending)

    def wait_until_actor_dead(actor):
        # A call on a dead actor raises RayActorError; any successful call
        # means the actor is still alive.
        try:
            ray.get(actor.ping.remote())
        except ray.exceptions.RayActorError:
            return True
        else:
            return False

    # Kill actors to remove all references.
    for actor in actors:
        ray.kill(actor)
        wait_for_condition(lambda: wait_until_actor_dead(actor))

    # The multi node deletion should work.
    def spill_dir_is_empty():
        return is_dir_empty(spill_dir)

    wait_for_condition(spill_dir_is_empty)
    assert_no_thrashing(cluster.address)
예제 #7
0
def test_multiple_directories(tmp_path, shutdown_only):
    """Check spilling across several directories and their cleanup.

    Three spill directories are configured. The test fills the object store,
    forces further objects to be spilled, and then verifies that:

    * every object can still be read back,
    * each configured directory received at least ``min_count`` files,
    * the spilled files disappear once the references are dropped, and
    * the directories are empty after ``ray.shutdown()``.
    """
    num_dirs = 3
    temp_dirs = []
    for i in range(num_dirs):
        temp_folder = tmp_path / f"spill_{i}"
        temp_folder.mkdir()
        temp_dirs.append(temp_folder)

    min_spilling_size = 0
    object_spilling_config = json.dumps({
        "type": "filesystem",
        "params": {
            "directory_path": [str(directory) for directory in temp_dirs]
        }
    })
    # Limit our object store to 75 MiB of memory.
    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 5,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": min_spilling_size,
        })

    arr = np.ones(74 * 1024 * 1024, dtype=np.uint8)  # 74MB.
    object_refs = []
    # Now the storage is full.
    object_refs.append(ray.put(arr))

    num_object_spilled = 20
    for _ in range(num_object_spilled):
        object_refs.append(ray.put(arr))

    # Record a count for EVERY directory, including directories that hold no
    # files. Otherwise an unused directory would simply be missing from the
    # mapping and the distribution assertion below would silently skip it.
    num_files = {}
    for temp_dir in temp_dirs:
        temp_folder = temp_dir / ray.ray_constants.DEFAULT_OBJECT_PREFIX
        num_files[str(temp_folder)] = sum(1 for _ in temp_folder.iterdir())

    for ref in object_refs:
        assert np.array_equal(ray.get(ref), arr)

    print("Check distribution...")
    min_count = 5
    is_distributed = [n_files >= min_count for n_files in num_files.values()]
    assert all(is_distributed), num_files

    print("Check deletion...")
    # Empty object refs.
    object_refs = []
    # Add a new object so that the last entry is evicted. Rebinding ``ref``
    # also releases the reference left over from the read-back loop above.
    ref = ray.put(arr)
    for temp_dir in temp_dirs:
        temp_folder = temp_dir
        wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["redis_address"])

    # Now kill ray and see all directories are deleted.
    print("Check directories are deleted...")
    ray.shutdown()
    for temp_dir in temp_dirs:
        wait_for_condition(lambda: is_dir_empty(temp_dir, append_path=""))