Example #1
def test_spillbuffer_evict(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=300, min_log_interval=0)

    bad = Bad(size=100)
    a = "a" * 100

    buf["a"] = a
    assert_buf(buf, {"a": a}, {})

    # successful eviction
    weight = buf.evict()
    assert weight == sizeof(a)
    assert_buf(buf, {}, {"a": a})

    buf["bad"] = bad
    assert_buf(buf, {"bad": bad}, {"a": a})

    # unsuccessful eviction
    with captured_logger(logging.getLogger("distributed.spill")) as logs_evict_key:
        weight = buf.evict()
    assert weight == -1

    assert "Failed to pickle" in logs_evict_key.getvalue()
    # bad key stays in fast
    assert_buf(buf, {"bad": bad}, {"a": a})
Example #2
def test_weakref_cache(tmpdir, cls, expect_cached, size):
    buf = SpillBuffer(str(tmpdir), target=100)

    # Run this test twice:
    # - x is smaller than target and is evicted by y;
    # - x is individually larger than target and never touches fast
    x = cls(size)
    buf["x"] = x
    if size < 100:
        buf["y"] = cls(60)  # spill x
    assert "x" in buf.slow

    # Test that we update the weakref cache on setitem
    assert (buf["x"] is x) == expect_cached

    # Do not use id_x = id(x): in CPython, ids are raw memory addresses and are reused
    # by PyMalloc once an object is garbage collected, so a brand new object may end up
    # with the same id as a deleted one
    id_x = x.id
    del x

    if size < 100:
        # Touching another key must not move x back to fast
        buf["y"]
    assert "x" in buf.slow

    x2 = buf["x"]
    assert x2.id != id_x
    if size < 100:
        buf["y"]
    assert "x" in buf.slow

    # Test that we update the weakref cache on getitem
    assert (buf["x"] is x2) == expect_cached
Example #3
def test_spillbuffer_fail_to_serialize(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=200, max_spill=600, min_log_interval=0)

    # bad data, individually larger than the spill threshold (target=200)
    a = Bad(size=201)

    # Exception caught in the worker
    with pytest.raises(TypeError, match="Could not serialize"):
        with captured_logger(logging.getLogger("distributed.spill")) as logs_bad_key:
            buf["a"] = a

    # spill.py must remain silent because we're already logging in worker.py
    assert not logs_bad_key.getvalue()
    assert_buf(buf, {}, {})

    b = Bad(size=100)  # this is small enough to fit in memory/fast

    buf["b"] = b
    assert_buf(buf, {"b": b}, {})

    c = "c" * 100
    with captured_logger(logging.getLogger("distributed.spill")) as logs_bad_key_mem:
        # This will go to fast and try to kick b out,
        # but b stays in fast since it's not picklable
        buf["c"] = c

    # worker.py won't intercept the exception here, so spill.py must dump the traceback
    logs_value = logs_bad_key_mem.getvalue()
    assert "Failed to pickle" in logs_value  # from distributed.spill
    assert "Traceback" in logs_value  # from distributed.spill
    assert_buf(buf, {"b": b, "c": c}, {})
Example #4
def test_spillbuffer_maxlim(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=200, max_spill=600, min_log_interval=0)

    a, b, c, d, e = "a" * 200, "b" * 100, "c" * 99, "d" * 199, "e" * 98

    # size of a is bigger than target and is smaller than max_spill;
    # key should be in slow
    buf["a"] = a
    assert_buf(buf, {}, {"a": a})
    assert buf["a"] == a

    # size of b is smaller than target; key should be in fast
    buf["b"] = b
    assert_buf(buf, {"b": b}, {"a": a})

    # size of c is smaller than target, but b+c > target: c stays in fast and b
    # moves to slow, since the max_spill limit has not been reached yet
    buf["c"] = c
    assert_buf(buf, {"c": c}, {"a": a, "b": b})

    # size of e < target, but e+c > target: this would move c to slow, but the
    # max_spill limit prevents it, so both c and e remain in fast

    with captured_logger(logging.getLogger("distributed.spill")) as logs_e:
        buf["e"] = e

    assert "disk reached capacity" in logs_e.getvalue()
    assert_buf(buf, {"c": c, "e": e}, {"a": a, "b": b})

    # size of d > target, so d should go to slow, but slow has reached the max_spill
    # limit; d ends up in fast together with c (which can't be moved to slow either,
    # because it won't fit)
    with captured_logger(logging.getLogger("distributed.spill")) as logs_d:
        buf["d"] = d

    assert "disk reached capacity" in logs_d.getvalue()
    assert_buf(buf, {"c": c, "d": d, "e": e}, {"a": a, "b": b})

    # Overwrite a key that was in slow, but the size of the new key is larger than
    # max_spill

    a_large = "a" * 500
    assert psize(a_large)[1] > 600  # size of max_spill

    with captured_logger(logging.getLogger("distributed.spill")) as logs_alarge:
        buf["a"] = a_large

    assert "disk reached capacity" in logs_alarge.getvalue()
    assert_buf(buf, {"a": a_large, "d": d, "e": e}, {"b": b, "c": c})

    # Overwrite a key that was in fast, but the size of the new key is larger than
    # max_spill

    d_large = "d" * 501
    with captured_logger(logging.getLogger("distributed.spill")) as logs_dlarge:
        buf["d"] = d_large

    assert "disk reached capacity" in logs_dlarge.getvalue()
    assert_buf(buf, {"a": a_large, "d": d_large, "e": e}, {"b": b, "c": c})
Example #5
def test_spillbuffer(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=300)
    # Convenience aliases
    assert buf.memory is buf.fast
    assert buf.disk is buf.slow

    assert_buf(buf, {}, {})

    a, b, c, d = "a" * 100, "b" * 99, "c" * 98, "d" * 97

    # Test assumption made by this test, mostly for non-CPython implementations
    assert 100 < sizeof(a) < 200
    assert psize(a)[0] != psize(a)[1]

    buf["a"] = a
    assert_buf(buf, {"a": a}, {})
    assert buf["a"] == a

    buf["b"] = b
    assert_buf(buf, {"a": a, "b": b}, {})

    buf["c"] = c
    assert_buf(buf, {"b": b, "c": c}, {"a": a})

    assert buf["a"] == a
    assert_buf(buf, {"a": a, "c": c}, {"b": b})

    buf["d"] = d
    assert_buf(buf, {"a": a, "d": d}, {"b": b, "c": c})

    # Deleting an in-memory key does not automatically move spilled keys back to memory
    del buf["a"]
    assert_buf(buf, {"d": d}, {"b": b, "c": c})
    with pytest.raises(KeyError):
        buf["a"]

    # Deleting a spilled key updates the metadata
    del buf["b"]
    assert_buf(buf, {"d": d}, {"c": c})
    with pytest.raises(KeyError):
        buf["b"]

    # Updating a spilled key moves it to the top of the LRU and to memory
    c2 = c * 2
    buf["c"] = c2
    assert_buf(buf, {"c": c2}, {"d": d})

    # Single key is larger than target and goes directly into slow
    e = "e" * 500

    buf["e"] = e
    assert_buf(buf, {"c": c2}, {"d": d, "e": e})

    # Updating a spilled key with another larger than target updates slow directly
    d = "d" * 500
    buf["d"] = d
    assert_buf(buf, {"c": c2}, {"d": d, "e": e})
Example #6
def test_spillbuffer_evict(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=300, min_log_interval=0)

    a_bad = Bad(size=100)
    a = "a" * 100

    buf["a"] = a

    assert set(buf.fast) == {"a"}
    assert not set(buf.slow)
    assert buf.fast.weights == {"a": sizeof(a)}

    # successful eviction
    weight = buf.evict()
    assert weight == sizeof(a)

    assert not buf.fast
    assert set(buf.slow) == {"a"}
    assert buf.slow.weight_by_key == {"a": psize(a)}

    buf["a_bad"] = a_bad

    assert set(buf.fast) == {"a_bad"}
    assert buf.fast.weights == {"a_bad": sizeof(a_bad)}
    assert set(buf.slow) == {"a"}
    assert buf.slow.weight_by_key == {"a": psize(a)}

    # unsuccessful eviction
    with captured_logger(
            logging.getLogger("distributed.spill")) as logs_evict_key:
        weight = buf.evict()
    assert weight == -1

    assert "Failed to pickle" in logs_evict_key.getvalue()
    # bad key stays in fast
    assert set(buf.fast) == {"a_bad"}
    assert buf.fast.weights == {"a_bad": sizeof(a_bad)}
    assert set(buf.slow) == {"a"}
    assert buf.slow.weight_by_key == {"a": psize(a)}
Example #7
def test_spillbuffer_oserror(tmpdir):
    buf = SpillBuffer(str(tmpdir),
                      target=200,
                      max_spill=800,
                      min_log_interval=0)

    a, b, c, d = (
        "a" * 200,
        "b" * 100,
        "c" * 201,
        "d" * 101,
    )

    # let's have something in fast and something in slow
    buf["a"] = a
    buf["b"] = b
    assert set(buf.fast) == {"b"}
    assert set(buf.slow) == {"a"}

    # Make the spill directory read-only.
    # This causes writes to raise OSError, just as when the disk is full.
    os.chmod(tmpdir, 0o555)

    # Add key > than target
    with captured_logger(
            logging.getLogger("distributed.spill")) as logs_oserror_slow:
        buf["c"] = c

    assert "Spill to disk failed" in logs_oserror_slow.getvalue()
    assert set(buf.fast) == {"b", "c"}
    assert set(buf.slow) == {"a"}

    assert buf.slow.weight_by_key == {"a": psize(a)}
    assert buf.fast.weights == {"b": sizeof(b), "c": sizeof(c)}

    del buf["c"]
    assert set(buf.fast) == {"b"}
    assert set(buf.slow) == {"a"}

    # Add a key that is smaller than target, but whose insertion triggers a spill,
    # which raises OSError
    with captured_logger(
            logging.getLogger("distributed.spill")) as logs_oserror_evict:
        buf["d"] = d

    assert "Spill to disk failed" in logs_oserror_evict.getvalue()
    assert set(buf.fast) == {"b", "d"}
    assert set(buf.slow) == {"a"}

    assert buf.slow.weight_by_key == {"a": psize(a)}
    assert buf.fast.weights == {"b": sizeof(b), "d": sizeof(d)}
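
Note that the test above leaves tmpdir read-only. When reusing this pattern outside the test, it is usually worth restoring write permissions afterwards so the directory can still be cleaned up; a minimal sketch (the helper name is illustrative, not part of distributed):

import os


def readonly_scope(path, body):
    """Call body() while path is read-only, then restore the original mode."""
    original_mode = os.stat(path).st_mode
    os.chmod(path, 0o555)
    try:
        return body()
    finally:
        os.chmod(path, original_mode)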
Example #8
def test_ensure_spilled_immediately(tmpdir):
    """See also test_value_raises_during_spilling"""
    import sys

    from distributed.spill import SpillBuffer

    mem_target = 1000
    buf = SpillBuffer(tmpdir, target=mem_target)
    buf["key"] = 1

    obj = LargeButForbiddenSerialization()
    assert sys.getsizeof(obj) > mem_target
    with pytest.raises(
        TypeError,
        match=f"Could not serialize object of type {LargeButForbiddenSerialization.__name__}",
    ):
        buf["error"] = obj
Example #9
def test_spillbuffer(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=300)
    # Convenience aliases
    assert buf.memory is buf.fast
    assert buf.disk is buf.slow

    assert not buf.spilled_by_key
    assert buf.spilled_total == 0

    a, b, c, d = "a" * 100, "b" * 100, "c" * 100, "d" * 100
    s = sizeof(a)
    # Test assumption made by this test, mostly for non-CPython implementations
    assert 100 < s < 200

    buf["a"] = a
    assert not buf.disk
    assert not buf.spilled_by_key
    assert buf.spilled_total == 0
    assert buf["a"] == a

    buf["b"] = b
    assert not buf.disk
    assert not buf.spilled_by_key
    assert buf.spilled_total == 0

    buf["c"] = c
    assert set(buf.disk) == {"a"}
    assert buf.spilled_by_key == {"a": s}
    assert buf.spilled_total == s

    assert buf["a"] == a
    assert set(buf.disk) == {"b"}
    assert buf.spilled_by_key == {"b": s}
    assert buf.spilled_total == s

    buf["d"] = d
    assert set(buf.disk) == {"b", "c"}
    assert buf.spilled_by_key == {"b": s, "c": s}
    assert buf.spilled_total == s * 2

    # Deleting an in-memory key does not automatically move spilled keys back to memory
    del buf["a"]
    assert set(buf.disk) == {"b", "c"}
    assert buf.spilled_by_key == {"b": s, "c": s}
    assert buf.spilled_total == s * 2
    with pytest.raises(KeyError):
        buf["a"]

    # Deleting a spilled key updates the metadata
    del buf["b"]
    assert set(buf.disk) == {"c"}
    assert buf.spilled_by_key == {"c": s}
    assert buf.spilled_total == s
    with pytest.raises(KeyError):
        buf["b"]

    # Updating a spilled key moves it to the top of the LRU and to memory
    buf["c"] = c * 2
    assert set(buf.disk) == {"d"}
    assert buf.spilled_by_key == {"d": s}
    assert buf.spilled_total == s

    # Single key is larger than target and goes directly into slow
    e = "e" * 500
    slarge = sizeof(e)
    buf["e"] = e
    assert set(buf.disk) == {"d", "e"}
    assert buf.spilled_by_key == {"d": s, "e": slarge}
    assert buf.spilled_total == s + slarge

    # Updating a spilled key with another larger than target updates slow directly
    buf["d"] = "d" * 500
    assert set(buf.disk) == {"d", "e"}
    assert buf.spilled_by_key == {"d": slarge, "e": slarge}
    assert buf.spilled_total == slarge * 2
Example #10
    def __init__(
        self,
        worker: Worker,
        *,
        memory_limit: str | float = "auto",
        # This should be None most of the times, short of a power user replacing the
        # SpillBuffer with their own custom dict-like
        data: (
            MutableMapping[str, Any]  # pre-initialised
            | Callable[[], MutableMapping[str, Any]]  # constructor
            | tuple[
                Callable[..., MutableMapping[str, Any]], dict[str, Any]
            ]  # (constructor, kwargs to constructor)
            | None  # create internally
        ) = None,
        # Deprecated parameters; use dask.config instead
        memory_target_fraction: float | Literal[False] | None = None,
        memory_spill_fraction: float | Literal[False] | None = None,
        memory_pause_fraction: float | Literal[False] | None = None,
    ):
        self.memory_limit = parse_memory_limit(memory_limit, worker.nthreads)

        self.memory_target_fraction = _parse_threshold(
            "distributed.worker.memory.target",
            "memory_target_fraction",
            memory_target_fraction,
        )
        self.memory_spill_fraction = _parse_threshold(
            "distributed.worker.memory.spill",
            "memory_spill_fraction",
            memory_spill_fraction,
        )
        self.memory_pause_fraction = _parse_threshold(
            "distributed.worker.memory.pause",
            "memory_pause_fraction",
            memory_pause_fraction,
        )

        max_spill = dask.config.get("distributed.worker.memory.max-spill")
        self.max_spill = False if max_spill is False else parse_bytes(max_spill)

        if isinstance(data, MutableMapping):
            self.data = data
        elif callable(data):
            self.data = data()
        elif isinstance(data, tuple):
            self.data = data[0](**data[1])
        elif self.memory_limit and (
            self.memory_target_fraction or self.memory_spill_fraction
        ):
            if self.memory_target_fraction:
                target = int(
                    self.memory_limit
                    * (self.memory_target_fraction or self.memory_spill_fraction)
                )
            else:
                target = sys.maxsize
            self.data = SpillBuffer(
                os.path.join(worker.local_directory, "storage"),
                target=target,
                max_spill=self.max_spill,
            )
        else:
            self.data = {}

        self._memory_monitoring = False

        self.memory_monitor_interval = parse_timedelta(
            dask.config.get("distributed.worker.memory.monitor-interval"),
            default=None,
        )
        assert isinstance(self.memory_monitor_interval, (int, float))

        if self.memory_limit and (
            self.memory_spill_fraction is not False
            or self.memory_pause_fraction is not False
        ):
            assert self.memory_monitor_interval is not None
            pc = PeriodicCallback(
                # Don't store worker as self.worker to avoid creating a circular
                # dependency. We could have alternatively used a weakref.
                # FIXME annotations: https://github.com/tornadoweb/tornado/issues/3117
                partial(self.memory_monitor, worker),  # type: ignore
                self.memory_monitor_interval * 1000,
            )
            worker.periodic_callbacks["memory_monitor"] = pc

        self._throttled_gc = ThrottledGC(logger=logger)
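
The data argument above accepts several shapes. The following standalone sketch (not the library's code, and ignoring the SpillBuffer fallback that also depends on memory_limit and the configured fractions) mirrors how the elif chain dispatches on it:

from collections.abc import MutableMapping


def resolve_data(data):
    """Mapping used as-is, callable invoked, (constructor, kwargs) expanded,
    anything else falls back to a plain dict."""
    if isinstance(data, MutableMapping):
        return data
    if callable(data):
        return data()
    if isinstance(data, tuple):
        constructor, kwargs = data
        return constructor(**kwargs)
    return {}


assert resolve_data({"x": 1}) == {"x": 1}          # pre-initialised mapping
assert resolve_data(dict) == {}                    # zero-argument constructor
assert resolve_data((dict, {"x": 1})) == {"x": 1}  # (constructor, kwargs)
assert resolve_data(None) == {}                    # default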
Example #11
def test_spillbuffer(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=300)
    # Convenience aliases
    assert buf.memory is buf.fast
    assert buf.disk is buf.slow

    assert not buf.slow.weight_by_key
    assert buf.slow.total_weight == (0, 0)
    assert buf.spilled_total == (0, 0)

    a, b, c, d = "a" * 100, "b" * 99, "c" * 98, "d" * 97

    # Test assumption made by this test, mostly for non-CPython implementations
    assert 100 < sizeof(a) < 200
    assert psize(a)[0] != psize(a)[1]

    buf["a"] = a
    assert not buf.slow
    assert buf.fast.weights == {"a": sizeof(a)}
    assert buf.fast.total_weight == sizeof(a)
    assert buf.slow.weight_by_key == {}
    assert buf.slow.total_weight == (0, 0)
    assert buf["a"] == a

    buf["b"] = b
    assert not buf.slow
    assert not buf.slow.weight_by_key
    assert buf.slow.total_weight == (0, 0)

    buf["c"] = c
    assert set(buf.slow) == {"a"}
    assert buf.slow.weight_by_key == {"a": psize(a)}
    assert buf.slow.total_weight == psize(a)

    assert buf["a"] == a
    assert set(buf.slow) == {"b"}
    assert buf.slow.weight_by_key == {"b": psize(b)}
    assert buf.slow.total_weight == psize(b)

    buf["d"] = d
    assert set(buf.slow) == {"b", "c"}
    assert buf.slow.weight_by_key == {"b": psize(b), "c": psize(c)}
    assert buf.slow.total_weight == psize(b, c)

    # Deleting an in-memory key does not automatically move spilled keys back to memory
    del buf["a"]
    assert set(buf.slow) == {"b", "c"}
    assert buf.slow.weight_by_key == {"b": psize(b), "c": psize(c)}
    assert buf.slow.total_weight == psize(b, c)
    with pytest.raises(KeyError):
        buf["a"]

    # Deleting a spilled key updates the metadata
    del buf["b"]
    assert set(buf.slow) == {"c"}
    assert buf.slow.weight_by_key == {"c": psize(c)}
    assert buf.slow.total_weight == psize(c)
    with pytest.raises(KeyError):
        buf["b"]

    # Updating a spilled key moves it to the top of the LRU and to memory
    buf["c"] = c * 2
    assert set(buf.slow) == {"d"}
    assert buf.slow.weight_by_key == {"d": psize(d)}
    assert buf.slow.total_weight == psize(d)

    # Single key is larger than target and goes directly into slow
    e = "e" * 500

    buf["e"] = e
    assert set(buf.slow) == {"d", "e"}
    assert buf.slow.weight_by_key == {"d": psize(d), "e": psize(e)}
    assert buf.slow.total_weight == psize(d, e)

    # Updating a spilled key with another larger than target updates slow directly
    d = "d" * 500
    buf["d"] = d
    assert set(buf.slow) == {"d", "e"}
    assert buf.slow.weight_by_key == {"d": psize(d), "e": psize(e)}
    assert buf.slow.total_weight == psize(d, e)