示例#1
0
def test_worker_task_refs(ray_start_regular):
    """Check the memory summary seen from inside a task and after it returns.

    A remote task puts an object, captures ``memory_summary()`` while that
    object is still referenced, and returns the captured text. The driver
    then inspects that text, takes its own summary while it still holds the
    task's return ref, and finally verifies nothing is left after dropping it.
    """

    @ray.remote
    def f(y):
        # Hold a put object only for the duration of the summary call.
        put_ref = ray.put("HI")
        captured = memory_summary()
        del put_ref
        return captured

    return_ref = f.remote(np.zeros(100000))
    summary = ray.get(return_ref)
    print(summary)
    assert num_objects(summary) == 4, summary
    in_task_expectations = [
        # Task argument plus task return ids.
        (TASK_CALL_OBJ, 2),
        (DRIVER_PID, 1),
        (WORKER_PID, 1),
        (LOCAL_REF, 2),
        (PINNED_IN_MEMORY, 1),
        (PUT_OBJ, 1),
        (DESER_TASK_ARG, 1),
        (UNKNOWN_SIZE, 1),
        ("test_memstat.py:f", 1),
        ("test_memstat.py:test_worker_task_refs", 2),
    ]
    for needle, expected in in_task_expectations:
        assert count(summary, needle) == expected, summary

    # Summary taken by the driver while it still holds the return ref.
    summary = memory_summary()
    print(summary)
    assert num_objects(summary) == 1, summary
    for needle, expected in [
        (DRIVER_PID, 1),
        (TASK_CALL_OBJ, 1),
        (UNKNOWN_SIZE, 0),
    ]:
        assert count(summary, needle) == expected, summary
    assert count(summary, return_ref.hex()) == 1, summary

    # Dropping the last ref should leave no objects in the summary.
    del return_ref
    summary = memory_summary()
    assert num_objects(summary) == 0, summary
示例#2
0
def test_pinned_object_call_site(ray_start_regular):
    """Track LOCAL_REF / PINNED_IN_MEMORY counts as a ref and its buffer are dropped.

    The summary is re-read after each step: after the put, after fetching the
    value, after deleting the ref, and after deleting the fetched value.
    """
    address = ray_start_regular["address"]

    def snapshot():
        # Fetch the current summary for this cluster and echo it for debugging.
        report = memory_summary(address)
        print(report)
        return report

    # Local ref only.
    put_ref = ray.put(np.zeros(100000))
    report = snapshot()
    assert num_objects(report) == 1, report
    assert count(report, LOCAL_REF) == 1, report
    assert count(report, PINNED_IN_MEMORY) == 0, report

    # Local ref + pinned buffer.
    fetched = ray.get(put_ref)
    report = snapshot()
    assert num_objects(report) == 1, report
    assert count(report, LOCAL_REF) == 0, report
    assert count(report, PINNED_IN_MEMORY) == 1, report

    # Just pinned buffer.
    del put_ref
    report = snapshot()
    assert num_objects(report) == 1, report
    assert count(report, LOCAL_REF) == 0, report
    assert count(report, PINNED_IN_MEMORY) == 1, report

    # Nothing.
    del fetched
    report = snapshot()
    assert num_objects(report) == 0, report
示例#3
0
 def f(y):
     """Return three memory summaries taken with different grouping/sorting.

     A put object is kept referenced while the summaries are collected and
     is released before returning. The result is a 3-tuple in the order:
     (STACK_TRACE/REFERENCE_TYPE, NODE_ADDRESS/OBJECT_SIZE, NODE_ADDRESS/PID).
     """
     put_ref = ray.put("HI")
     option_pairs = [
         ("STACK_TRACE", "REFERENCE_TYPE"),
         ("NODE_ADDRESS", "OBJECT_SIZE"),
         ("NODE_ADDRESS", "PID"),
     ]
     # Summaries are produced in the listed order while put_ref is alive.
     summaries = tuple(
         memory_summary(group_by=grouping, sort_by=ordering)
         for grouping, ordering in option_pairs
     )
     del put_ref
     return summaries
示例#4
0
def test_memory_release_eager(shutdown_only):
    """Run two eager map rounds and check the first one does not spill.

    The cluster address is read once from the context returned by
    ``ray.init`` and reused for both summary calls, so both rounds query the
    context the same way (the original mixed ``info.address_info[...]`` and
    ``info[...]``).
    """
    info = ray.init(num_cpus=1, object_store_memory=1500e6)
    address = info.address_info["address"]
    ds = ray.data.range(10)

    # Round 1.
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    meminfo = memory_summary(address, stats_only=True)
    assert "Spilled" not in meminfo, meminfo

    # Round 2.
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    # NOTE(review): the round-2 summary is fetched but not asserted on in the
    # code visible here; that is kept as-is rather than guessing at intent.
    meminfo = memory_summary(address, stats_only=True)
示例#5
0
def test_driver_put_ref(ray_start_regular):
    """A driver-side put shows up as one driver-owned object and vanishes on del."""
    # Before anything is put, the summary should list no objects.
    before = memory_summary()
    assert num_objects(before) == 0, before

    put_ref = ray.put("HI")
    during = memory_summary()
    print(during)
    assert num_objects(during) == 1, during
    assert count(during, DRIVER_PID) == 1, during
    assert count(during, WORKER_PID) == 0, during

    # Releasing the only ref should empty the summary again.
    del put_ref
    after = memory_summary()
    assert num_objects(after) == 0, after
示例#6
0
def test_multi_node_stats(shutdown_only):
    """The summary should include put objects held by actors on two nodes."""
    cluster = Cluster()
    node_count = 2
    for _ in range(node_count):
        cluster.add_node(num_cpus=1)

    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            # Each actor keeps one put object referenced for its lifetime.
            self.ref = ray.put(np.zeros(100000))

        def ping(self):
            pass

    # Each actor will be on a different node.
    first_actor = Actor.remote()
    second_actor = Actor.remote()
    # Wait for both actors to answer before reading the summary.
    ray.get(first_actor.ping.remote())
    ray.get(second_actor.ping.remote())

    # Verify we have collected stats across the nodes.
    summary = memory_summary()
    print(summary)
    assert count(summary, PUT_OBJ) == 2, summary
示例#7
0
def test_spill_stats(object_spilling_config, shutdown_only):
    """Check the plasma/spill/restore figures reported by ``memory_summary``.

    Five tasks that each return a 50 MiB array are run against a 100 MiB
    object store with automatic spilling enabled, then the summary text is
    checked for the expected usage, spilled and restored lines.
    """
    # The fixture value is unpacked as a pair; only the first element is used.
    object_spilling_config, _ = object_spilling_config
    # Limit our object store to 100 MiB of memory.
    ray.init(
        num_cpus=1,
        object_store_memory=100 * 1024 * 1024,
        _system_config={
            "automatic_object_spilling_enabled": True,
            "max_io_workers": 100,
            "min_spilling_size": 1,
            "object_spilling_config": object_spilling_config
        },
    )

    @ray.remote
    def f():
        # 50 MiB of uint8 zeros.
        return np.zeros(50 * 1024 * 1024, dtype=np.uint8)

    ids = []
    for _ in range(4):
        x = f.remote()
        ids.append(x)
    # NOTE(review): `x` still refers to the last submitted ref after this
    # loop; presumably that affects the counts asserted below -- confirm
    # before refactoring.

    # Fetch the results in reverse submission order, dropping each list entry.
    while ids:
        print(ray.get(ids.pop()))

    x_id = f.remote()  # noqa
    ray.get(x_id)
    s = memory_summary()
    assert "Plasma memory usage 50 MiB, 1 objects, 50.0% full" in s, s
    assert "Spilled 200 MiB, 4 objects" in s, s
    assert "Restored 150 MiB, 3 objects" in s, s
示例#8
0
 def ok():
     """Return True when the stats-only summary matches the expected figures.

     Reads ``address``, ``restored``, ``spilled`` and ``fallback`` from the
     enclosing scope. For each of them: a truthy value must appear in the
     summary with that amount, a falsy value means the corresponding line
     must be absent. ``spilled`` may be a single value or a list of
     acceptable values.
     """
     report = memory_summary(address=address["address"], stats_only=True)
     print(report)

     # Restored amount.
     if restored:
         restored_ok = "Restored {} MiB".format(restored) in report
     else:
         restored_ok = "Restored" not in report
     if not restored_ok:
         return False

     # Spilled amount: any one of the candidate values is acceptable.
     if spilled:
         candidates = spilled if isinstance(spilled, list) else [spilled]
         spilled_ok = any(
             "Spilled {} MiB".format(amount) in report for amount in candidates
         )
     else:
         spilled_ok = "Spilled" not in report
     if not spilled_ok:
         return False

     # Fallback (filesystem mmap) usage.
     if fallback:
         return "Plasma filesystem mmap usage: {} MiB".format(fallback) in report
     return "Plasma filesystem mmap usage:" not in report
示例#9
0
    def f(y):
        """Return the memory summary captured while a put object is referenced.

        ``address`` comes from the enclosing scope. ``memory_summary`` is
        imported inside the function body (presumably so it resolves where
        the function actually executes -- confirm against the caller).
        """
        from ray.internal.internal_api import memory_summary

        put_ref = ray.put("HI")
        captured = memory_summary(address)
        # Release the put object only after the summary has been taken.
        del put_ref
        return captured
示例#10
0
def test_multi_node_stats(shutdown_only):
    """The summary should include put objects held by actors on two nodes.

    ``RAY_record_ref_creation_sites`` is set for the duration of the test and
    restored to its previous state afterwards, so the setting does not leak
    into other tests running in the same process.
    """
    # NOTE(mwtian): using env var only enables the feature on workers, while
    # using head_node_args={"_system_config": ray_config} only enables the
    # feature on the driver.
    env_name = "RAY_record_ref_creation_sites"
    previous_value = os.environ.get(env_name)
    os.environ[env_name] = "1"
    try:
        cluster = Cluster()
        for _ in range(2):
            cluster.add_node(num_cpus=1)

        ray.init(address=cluster.address)

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                # Each actor keeps one put object referenced for its lifetime.
                self.ref = ray.put(np.zeros(100000))

            def ping(self):
                pass

        # Each actor will be on a different node.
        a = Actor.remote()
        b = Actor.remote()
        ray.get(a.ping.remote())
        ray.get(b.ping.remote())

        # Verify we have collected stats across the nodes.
        info = memory_summary(cluster.address)
        print(info)
        assert count(info, PUT_OBJ) == 2, info
    finally:
        # Put the environment back exactly as it was before the test.
        if previous_value is None:
            os.environ.pop(env_name, None)
        else:
            os.environ[env_name] = previous_value
示例#11
0
def test_actor_task_refs(ray_start_regular):
    """Check reference accounting for actor task arguments and handles.

    The actor stores every argument it receives and returns the summary it
    observes. The test inspects that summary, then checks that arguments
    accumulate across calls and that everything is released once the actor
    handle is dropped.
    """
    address = ray_start_regular["address"]

    @ray.remote
    class Actor:
        def __init__(self):
            self.refs = []

        def f(self, x):
            from ray.internal.internal_api import memory_summary

            # Keep the argument alive inside the actor, then report.
            self.refs.append(x)
            return memory_summary(address)

    def make_actor():
        return Actor.remote()

    actor = make_actor()
    return_ref = actor.f.remote(np.zeros(100000))
    summary = ray.get(return_ref)
    print(summary)
    # Note, the actor will always hold a handle to the actor itself.
    assert num_objects(summary) == 5, summary
    in_actor_expectations = [
        # Actor handle, task argument id, task return id.
        (ACTOR_TASK_CALL_OBJ, 3),
        (DRIVER_PID, 3),
        (WORKER_PID, 2),
        (LOCAL_REF, 1),
        (PINNED_IN_MEMORY, 1),
        (USED_BY_PENDING_TASK, 1),
        (ACTOR_HANDLE, 2),
        (DESER_ACTOR_TASK_ARG, 1),
    ]
    for needle, expected in in_actor_expectations:
        assert count(summary, needle) == expected, summary
    del return_ref

    # These should accumulate in the actor.
    for _ in range(5):
        ray.get(actor.f.remote([ray.put(np.zeros(100000))]))
    summary = memory_summary(address)
    print(summary)
    assert count(summary, DESER_ACTOR_TASK_ARG) == 5, summary
    assert count(summary, ACTOR_TASK_CALL_OBJ) == 1, summary

    # Cleanup.
    del actor
    time.sleep(1)
    summary = memory_summary(address)
    assert num_objects(summary) == 0, summary
示例#12
0
def test_memory_used_output(ray_start_regular):
    """The summary should report usage and size for one 8 MiB put object."""
    import numpy as np

    # The ref must stay bound so the object is still alive for the summary.
    held_ref = ray.put(np.ones(8 * 1024 * 1024, dtype=np.int8))  # noqa

    summary = memory_summary()
    print(summary)
    expectations = [
        ("Plasma memory usage 8 MiB", 1),
        ("8388861 B", 2),
    ]
    for needle, expected in expectations:
        assert count(summary, needle) == expected, summary
示例#13
0
 def filtered_summary():
     """Return the unwrapped memory summary without lines mentioning ACTOR_HANDLE.

     ``address`` is taken from the enclosing scope.
     """
     rows = memory_summary(address, line_wrap=False).split("\n")
     return "\n".join(row for row in rows if "ACTOR_HANDLE" not in row)
示例#14
0
def check_no_spill(ctx, pipe, prefetch_blocks: int = 0):
    """Consume batches from ``pipe`` for about 10 seconds and assert nothing spilled.

    Args:
        ctx: Object whose ``address_info["address"]`` identifies the cluster.
        pipe: Object providing ``iter_batches(prefetch_blocks=...)``.
        prefetch_blocks: Forwarded unchanged to ``iter_batches``.
    """
    # Run .iter_batches() for 10 secs, and we expect no object spilling.
    deadline = time.time() + 10
    for _batch in pipe.iter_batches(prefetch_blocks=prefetch_blocks):
        # The clock is only checked after a batch has been produced.
        if time.time() > deadline:
            break
    cluster_address = ctx.address_info["address"]
    stats = memory_summary(cluster_address, stats_only=True)
    assert "Spilled" not in stats, stats
示例#15
0
def test_spill_stats(object_spilling_config, shutdown_only):
    """Check spill/restore figures and the consumed-bytes line in the summary.

    Five tasks that each return a 50 MiB array are run against a 100 MiB
    object store with automatic spilling enabled. After checking the usage,
    spilled and restored lines, a 30 MiB object is passed to one more task
    and the "consumed" total is verified.
    """
    # The fixture value is unpacked as a pair; only the first element is used.
    object_spilling_config, _ = object_spilling_config
    # Limit our object store to 100 MiB of memory.
    address = ray.init(
        num_cpus=1,
        object_store_memory=100 * 1024 * 1024,
        _system_config={
            "automatic_object_spilling_enabled": True,
            "max_io_workers": 100,
            "min_spilling_size": 1,
            "object_spilling_config": object_spilling_config,
        },
    )

    @ray.remote
    def f():
        # 50 MiB of uint8 zeros.
        return np.zeros(50 * 1024 * 1024, dtype=np.uint8)

    ids = []
    for _ in range(4):
        x = f.remote()
        ids.append(x)
    # NOTE(review): `x` still refers to the last submitted ref after this
    # loop; presumably that affects the counts asserted below -- confirm
    # before refactoring.

    # Fetch the results in reverse submission order, dropping each list entry.
    while ids:
        print(ray.get(ids.pop()))

    x_id = f.remote()  # noqa
    ray.get(x_id)
    s = memory_summary(address=address["address"], stats_only=True)
    assert "Plasma memory usage 50 MiB, 1 objects, 50.0% full" in s, s
    assert "Spilled 200 MiB, 4 objects" in s, s
    assert "Restored 150 MiB, 3 objects" in s, s

    # Test if consumed bytes are correctly calculated.
    obj = ray.put(np.zeros(30 * 1024 * 1024, dtype=np.uint8))

    @ray.remote
    def func_with_ref(obj):
        # The argument is only received, never used.
        return True

    ray.get(func_with_ref.remote(obj))

    s = memory_summary(address=address["address"], stats_only=True)
    # 50MB * 5 references + 30MB used for task execution.
    assert "Objects consumed by Ray tasks: 280 MiB." in s, s
    # Finally, check that restored does not exceed consumed.
    assert_no_thrashing(address["address"])
示例#16
0
def test_memory_sanity(shutdown_only):
    """With a 500e6-byte object store, mapping to 100 MiB blocks should spill."""
    ctx = ray.init(num_cpus=1, object_store_memory=500e6)
    # Every one of the 10 rows is replaced by a 100 MiB uint8 array.
    dataset = ray.data.range(10).map(
        lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)
    )
    cluster_address = ctx.address_info["address"]
    stats = memory_summary(cluster_address, stats_only=True)

    # Sanity check spilling is happening as expected.
    assert "Spilled" in stats, stats
示例#17
0
def test_memory_release_pipeline(shutdown_only, lazy_input):
    """Run a multi-stage single-window pipeline and assert nothing is spilled.

    Args:
        shutdown_only: Fixture; not used directly in the body.
        lazy_input: When truthy the input is read through ``OnesSource``;
            otherwise it is built from a list of integers and expanded to
            arrays by the first map stage.
    """
    context = DatasetContext.get_current()
    # Disable stage fusion so we can keep reads and maps from being fused together,
    # since we're trying to test multi-stage memory releasing here.
    context.optimize_fuse_stages = False
    # This object store allocation can hold at most 1 copy of the transformed dataset.
    # Both input modes use the same allocation (the original branched on
    # lazy_input but assigned the identical value in each branch).
    object_store_memory = 3000e6

    n = 10
    info = ray.init(num_cpus=n, object_store_memory=object_store_memory)
    if lazy_input:
        ds = ray.data.read_datasource(
            OnesSource(),
            parallelism=n,
            n_per_block=100 * 1024 * 1024,
        )
    else:
        ds = ray.data.from_items(list(range(n)), parallelism=n)

    # Create a single-window pipeline.
    pipe = ds.window(blocks_per_window=n)

    # Round 1.
    def gen(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure handling.
        time.sleep(2)
        # Arrays pass through untouched; anything else becomes a 100 MiB array.
        if isinstance(x, np.ndarray):
            return x
        else:
            return np.ones(100 * 1024 * 1024, dtype=np.uint8)

    pipe = pipe.map(gen)

    def inc(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure handling.
        time.sleep(2)
        return x + 1

    num_rounds = 10
    for _ in range(num_rounds):
        pipe = pipe.map(inc)

    # Build the expected array once instead of once per compared element.
    expected = np.ones(100 * 1024 * 1024, dtype=np.uint8) + num_rounds
    for block in pipe.iter_batches(batch_size=None):
        for arr in block:
            np.testing.assert_equal(arr, expected)
    meminfo = memory_summary(info["address"], stats_only=True)
    assert "Spilled" not in meminfo, meminfo
示例#18
0
def test_actor_task_refs(ray_start_regular):
    """Check reference accounting and call sites for actor task arguments.

    The actor stores every argument it receives and returns the summary it
    observes. The test inspects that summary (including the recorded call
    sites), then checks that arguments accumulate across calls and that
    everything is released once the actor handle is dropped.
    """

    @ray.remote
    class Actor:
        def __init__(self):
            self.refs = []

        def f(self, x):
            # Keep the argument alive inside the actor, then report.
            self.refs.append(x)
            return memory_summary()

    def make_actor():
        return Actor.remote()

    actor = make_actor()
    return_ref = actor.f.remote(np.zeros(100000))
    summary = ray.get(return_ref)
    print(summary)
    assert num_objects(summary) == 4, summary
    in_actor_expectations = [
        # Actor handle, task argument id, task return id.
        (ACTOR_TASK_CALL_OBJ, 3),
        (DRIVER_PID, 1),
        (WORKER_PID, 1),
        (LOCAL_REF, 1),
        (PINNED_IN_MEMORY, 1),
        (USED_BY_PENDING_TASK, 2),
        (DESER_ACTOR_TASK_ARG, 1),
        ("test_memstat.py:test_actor_task_refs", 3),
        ("test_memstat.py:make_actor", 1),
    ]
    for needle, expected in in_actor_expectations:
        assert count(summary, needle) == expected, summary
    del return_ref

    # These should accumulate in the actor.
    for _ in range(5):
        ray.get(actor.f.remote([ray.put(np.zeros(100000))]))
    summary = memory_summary()
    print(summary)
    assert count(summary, DESER_ACTOR_TASK_ARG) == 5, summary
    assert count(summary, ACTOR_TASK_CALL_OBJ) == 1, summary

    # Cleanup.
    del actor
    time.sleep(1)
    summary = memory_summary()
    assert num_objects(summary) == 0, summary
示例#19
0
def test_nested_object_refs(ray_start_regular):
    """Refs nested inside put objects should be reported as captured.

    Three objects are chained (each later put wraps the previous ref in a
    list); only the outermost ref is kept by the test when the summary is
    taken.
    """
    inner_ref = ray.put(np.zeros(100000))
    middle_ref = ray.put([inner_ref])
    outer_ref = ray.put([middle_ref])
    # Drop the direct handles to the two wrapped objects.
    del inner_ref, middle_ref

    summary = memory_summary()
    print(summary)
    assert num_objects(summary) == 3, summary
    for needle, expected in [(LOCAL_REF, 1), (CAPTURED_IN_OBJECT, 2)]:
        assert count(summary, needle) == expected, summary
    del outer_ref
示例#20
0
def test_memory_release_lazy(shutdown_only):
    """Three lazy map stages over 100 MiB blocks should execute without spilling."""
    ctx = ray.init(num_cpus=1, object_store_memory=1500e6)
    ds = ray.data.range(10)

    # Should get fused into single stage.
    ds = ds._experimental_lazy()
    map_stage_count = 3
    for _ in range(map_stage_count):
        ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds.fully_executed()

    cluster_address = ctx.address_info["address"]
    stats = memory_summary(cluster_address, stats_only=True)
    assert "Spilled" not in stats, stats
示例#21
0
def test_worker_task_refs(ray_start_regular):
    """Check the memory summary seen from inside a task and after it returns.

    A remote task puts an object, captures the summary for the fixture's
    cluster address while that object is still referenced, and returns the
    captured text. The driver inspects that text, takes its own summary
    while it still holds the task's return ref, and finally verifies nothing
    is left after dropping it.
    """
    address = ray_start_regular["address"]

    @ray.remote
    def f(y):
        from ray.internal.internal_api import memory_summary

        # Hold a put object only for the duration of the summary call.
        put_ref = ray.put("HI")
        captured = memory_summary(address)
        del put_ref
        return captured

    return_ref = f.remote(np.zeros(100000))
    summary = ray.get(return_ref)
    print(summary)
    assert num_objects(summary) == 4, summary
    in_task_expectations = [
        # Task argument plus task return ids.
        (TASK_CALL_OBJ, 2),
        (DRIVER_PID, 2),
        (WORKER_PID, 2),
        (LOCAL_REF, 2),
        (PINNED_IN_MEMORY, 1),
        (PUT_OBJ, 1),
        (DESER_TASK_ARG, 1),
        (UNKNOWN_SIZE, 1),
    ]
    for needle, expected in in_task_expectations:
        assert count(summary, needle) == expected, summary

    print(ray_start_regular)
    # Summary taken by the driver while it still holds the return ref.
    summary = memory_summary(address)
    print(summary)
    assert num_objects(summary) == 1, summary
    for needle, expected in [
        (DRIVER_PID, 1),
        (TASK_CALL_OBJ, 1),
        (UNKNOWN_SIZE, 0),
    ]:
        assert count(summary, needle) == expected, summary
    assert count(summary, return_ref.hex()) == 1, summary

    # Dropping the last ref should leave no objects in the summary.
    del return_ref
    summary = memory_summary(address)
    assert num_objects(summary) == 0, summary
示例#22
0
def assert_no_thrashing(address):
    """Assert that the summary's consumed figure is at least its restored figure.

    The restored value is the second space-separated token of the last line
    containing "Restored"; the consumed value is the second-to-last token of
    the last line containing "consumed". Either defaults to 0 when no such
    line exists.

    Args:
        address: Address used both to initialize the global state and to
            request the stats-only memory summary.
    """
    global_state = ray.state.GlobalState()
    gcs_options = GcsClientOptions.from_gcs_address(address)
    global_state._initialize_global_state(gcs_options)

    report = memory_summary(address=address, stats_only=True)
    restored = 0
    consumed = 0
    for row in report.split("\n"):
        # A single row is checked for both markers independently.
        if "Restored" in row:
            tokens = row.split(" ")
            restored = int(tokens[1])
        if "consumed" in row:
            tokens = row.split(" ")
            consumed = int(tokens[-2])

    assert consumed >= restored, f"consumed: {consumed}, restored: {restored}"
示例#23
0
def test_memory_release_lazy(shutdown_only):
    """With stage fusion on, three lazy map stages should execute without spilling."""
    dataset_context = DatasetContext.get_current()
    # Ensure that stage fusion is enabled.
    dataset_context.optimize_fuse_stages = True
    ctx = ray.init(num_cpus=1, object_store_memory=1500e6)
    ds = ray.data.range(10)

    # Should get fused into single stage.
    ds = ds.experimental_lazy()
    map_stage_count = 3
    for _ in range(map_stage_count):
        ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds.fully_executed()

    cluster_address = ctx.address_info["address"]
    stats = memory_summary(cluster_address, stats_only=True)
    assert "Spilled" not in stats, stats
示例#24
0
def assert_no_thrashing(address):
    """Assert that the summary's consumed figure is at least its restored figure.

    The restored value is the second space-separated token of the last line
    containing "Restored"; the consumed value is the second-to-last token of
    the last line containing "consumed". Either defaults to 0 when no such
    line exists.

    Args:
        address: Address used both to initialize the global state (together
            with the default Redis password) and to request the stats-only
            memory summary.
    """
    global_state = ray.state.GlobalState()
    redis_password = ray.ray_constants.REDIS_DEFAULT_PASSWORD
    global_state._initialize_global_state(address, redis_password)

    report = memory_summary(address=address, stats_only=True)
    restored = 0
    consumed = 0
    for row in report.split("\n"):
        # A single row is checked for both markers independently.
        if "Restored" in row:
            tokens = row.split(" ")
            restored = int(tokens[1])
        if "consumed" in row:
            tokens = row.split(" ")
            consumed = int(tokens[-2])

    assert consumed >= restored, f"consumed: {consumed}, restored: {restored}"
示例#25
0
 def ok():
     """Return True when the stats-only summary matches the expected figures.

     Reads ``address``, ``restored`` and ``spilled`` from the enclosing
     scope. A truthy value must appear in the summary with that amount; a
     falsy value means the corresponding line must be absent.
     """
     report = memory_summary(address=address["redis_address"], stats_only=True)
     print(report)

     # Restored amount.
     if restored:
         restored_ok = "Restored {} MiB".format(restored) in report
     else:
         restored_ok = "Restored" not in report
     if not restored_ok:
         return False

     # Spilled amount.
     if spilled:
         return "Spilled {} MiB".format(spilled) in report
     return "Spilled" not in report
示例#26
0
def test_spill_stats(tmp_path, shutdown_only):
    """Check the plasma/spill/restore figures reported by ``memory_summary``.

    Five tasks that each return a 50 MiB array are run against a 100 MiB
    object store that spills to a directory under ``tmp_path``, then the
    summary text is checked for the expected usage, spilled and restored
    lines.
    """
    # Directory that receives the spilled objects.
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()
    # Limit our object store to 100 MiB of memory.
    ray.init(
        num_cpus=1,
        object_store_memory=100 * 1024 * 1024,
        _system_config={
            "automatic_object_spilling_enabled":
            True,
            "max_io_workers":
            100,
            "min_spilling_size":
            1,
            "object_spilling_config":
            json.dumps(
                {
                    "type": "filesystem",
                    "params": {
                        "directory_path": str(temp_folder)
                    }
                },
                separators=(",", ":"))
        },
    )

    @ray.remote
    def f():
        # 50 MiB of uint8 zeros.
        return np.zeros(50 * 1024 * 1024, dtype=np.uint8)

    ids = []
    for _ in range(4):
        x = f.remote()
        ids.append(x)
    # NOTE(review): `x` still refers to the last submitted ref after this
    # loop; presumably that affects the counts asserted below -- confirm
    # before refactoring.

    # Fetch the results in reverse submission order, dropping each list entry.
    while ids:
        print(ray.get(ids.pop()))

    x_id = f.remote()  # noqa
    ray.get(x_id)
    s = memory_summary()
    assert "Plasma memory usage 50 MiB, 1 objects, 50.0% full" in s, s
    assert "Spilled 200 MiB, 4 objects" in s, s
    assert "Restored 150 MiB, 3 objects" in s, s
示例#27
0
 def stats():
     """Count "Attempt #2" summary lines per task status.

     Returns a 3-tuple of line counts for WAITING_FOR_DEPENDENCIES,
     SCHEDULED and FINISHED, in that order. ``cluster`` is taken from the
     enclosing scope.
     """
     rows = memory_summary(cluster.address, line_wrap=False).split("\n")

     def second_attempts_with(status):
         # Number of rows that mention both the second attempt and the status.
         return sum(1 for row in rows if "Attempt #2" in row and status in row)

     waiting = second_attempts_with(WAITING_FOR_DEPENDENCIES)
     scheduled = second_attempts_with(SCHEDULED)
     finished = second_attempts_with(FINISHED)
     return (waiting, scheduled, finished)
示例#28
0
def assert_no_thrashing(address):
    """Assert that the summary's consumed figure is at least its restored figure.

    The restored value is the second space-separated token of the last line
    containing "Restored"; the consumed value is the second-to-last token of
    the last line containing "consumed". Either defaults to 0 when no such
    line exists.

    Args:
        address: Address used both to build the GCS client options (from a
            GCS or a Redis address, depending on ``use_gcs_for_bootstrap()``)
            and to request the stats-only memory summary.
    """
    global_state = ray.state.GlobalState()
    gcs_options = (
        GcsClientOptions.from_gcs_address(address)
        if use_gcs_for_bootstrap()
        else GcsClientOptions.from_redis_address(
            address, ray.ray_constants.REDIS_DEFAULT_PASSWORD
        )
    )
    global_state._initialize_global_state(gcs_options)

    report = memory_summary(address=address, stats_only=True)
    restored = 0
    consumed = 0
    for row in report.split("\n"):
        # A single row is checked for both markers independently.
        if "Restored" in row:
            tokens = row.split(" ")
            restored = int(tokens[1])
        if "consumed" in row:
            tokens = row.split(" ")
            consumed = int(tokens[-2])

    assert consumed >= restored, f"consumed: {consumed}, restored: {restored}"
示例#29
0
def test_memory_release_lazy_shuffle(shutdown_only):
    """A lazy map followed by a shuffle should not spill; retried up to 3 times.

    Each trial starts Ray afresh and always shuts it down again. The test
    returns on the first successful trial; if every trial fails, the last
    captured exception is re-raised.
    """
    # TODO(ekl) why is this flaky? Due to eviction delay?
    last_failure = None
    max_trials = 3
    for attempt in range(max_trials):
        print("Try", attempt)
        try:
            ctx = ray.init(num_cpus=1, object_store_memory=1800e6)
            ds = ray.data.range(10)

            # Should get fused into single stage.
            ds = ds._experimental_lazy()
            ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
            ds.random_shuffle().fully_executed()
            cluster_address = ctx.address_info["address"]
            stats = memory_summary(cluster_address, stats_only=True)
            assert "Spilled" not in stats, stats
            return
        except Exception as exc:
            # Remember the failure and fall through to the next trial.
            last_failure = exc
            print("Failed", exc)
        finally:
            ray.shutdown()
    raise last_failure
示例#30
0
 def ok():
     """Return True when the stats-only summary matches the expected figures.

     Reads ``address``, ``restored``, ``spilled`` and ``fallback`` from the
     enclosing scope. A truthy value must appear in the summary with that
     amount; a falsy value means the corresponding line must be absent.
     """
     report = memory_summary(address=address["redis_address"], stats_only=True)
     print(report)

     # Restored amount.
     if restored:
         restored_ok = "Restored {} MiB".format(restored) in report
     else:
         restored_ok = "Restored" not in report
     if not restored_ok:
         return False

     # Spilled amount.
     if spilled:
         spilled_ok = "Spilled {} MiB".format(spilled) in report
     else:
         spilled_ok = "Spilled" not in report
     if not spilled_ok:
         return False

     # Fallback (filesystem mmap) usage.
     if fallback:
         return "Plasma filesystem mmap usage: {} MiB".format(fallback) in report
     return "Plasma filesystem mmap usage:" not in report