Example #1
def test_series_squash_stability():
    label = "LABEL"
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, "a_collection")
    series = local_coll / label

    months = list(range(1, 12))
    delta = timedelta(days=1)
    for start, stop in zip(months[:-1], months[1:]):
        ts = drange(f"2020-{start:02}-01", f"2020-{stop:02}-01", delta)
        values = [start] * len(ts)
        series.write({"timestamp": ts, "value": values})

    local_coll.push(remote_coll)
    local_coll.squash()
    remote_coll.squash()

    local_files = local_coll.pod.walk()
    remote_files = remote_coll.pod.walk()

    local_digests = set(
        Revision.from_path(local_coll.changelog, f).digests
        for f in local_files if "." in f)
    remote_digests = set(
        Revision.from_path(remote_coll.changelog, f).digests
        for f in remote_files if "." in f)
    assert local_digests == remote_digests
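
The test relies on names defined elsewhere in the suite. A hypothetical preamble, where the module paths and the schema definition are assumptions rather than the original code:

# Hypothetical preamble; module paths and the schema are assumptions.
from datetime import timedelta

from lakota import Repo, Schema
from lakota.changelog import Revision   # assumed module path
from lakota.utils import drange         # assumed module path

schema = Schema(timestamp="timestamp*", value="float")  # assumed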
Example #2
def test_refresh():
    pod = MemPOD(".")
    repo = Repo(pod=pod)

    repo.create_collection(SCHEMA, "collection")
    assert repo.ls() == ["collection"]
    repo2 = Repo(pod=pod)
    repo2.delete("collection")
    # repo is out of sync
    assert repo.ls() == ["collection"]
    # refresh solves this
    repo.refresh()
    assert repo.ls() == []
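
Here SCHEMA and MemPOD also come from the test module; a hypothetical preamble (the schema definition and module path are assumptions):

# Hypothetical preamble; schema and module path are assumptions.
from lakota import Repo, Schema
from lakota.pod import MemPOD   # assumed module path

SCHEMA = Schema(timestamp="timestamp*", value="float")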
Example #3
def test_label_delete_push(squash):
    kv_schema = Schema.kv(timestamp="int*", value="float")

    labels = list("abcd")
    local_repo = Repo()
    local_clct = local_repo.create_collection(kv_schema, "a_collection")
    remote_repo = Repo()
    remote_clct = remote_repo.create_collection(kv_schema, "a_collection")

    # Write some data
    frm = {
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    }
    for label in labels:
        series = local_clct / label
        series.write(frm)

    # Create some labels and push them
    local_clct.push(remote_clct)
    if squash:
        remote_clct.squash()
    assert local_clct.ls() == labels
    assert remote_clct.ls() == labels

    # Delete one local label and push again
    local_clct.delete("c")
    local_clct.push(remote_clct)
    if squash:
        remote_clct.merge()
        remote_clct.squash()
    else:
        remote_clct.refresh()

    assert remote_clct.ls() == list("abd")
    assert local_clct.ls() == list("abd")

    # Delete one remote label and pull
    sleep(0.1)  # Needed to avoid concurrent writes
    remote_clct.delete("d")
    local_clct.pull(remote_clct)
    if squash:
        local_clct.squash()
    else:
        local_clct.refresh()
    assert remote_clct.ls() == list("ab")
    assert local_clct.ls() == list("ab")
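
The squash argument is not defined in the snippet; it is presumably supplied by pytest parametrization, along these lines (the pod, size, direction and shallow arguments of later examples are likely fixtures or parameters of the same kind):

# Hypothetical parametrization; the actual decorator is not shown above.
import pytest

@pytest.mark.parametrize("squash", [True, False])
def test_label_delete_push(squash):
    ...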
Example #4
def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()

    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
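
The insert function mapped over the workers is not shown, and neither is years. Since 10,519,200 rows is exactly twenty years of minutes (including five leap days), years is presumably something like range(2000, 2020). A plausible sketch of the worker, assuming a POD.from_token helper and the drange utility from Example #1's preamble:

# Plausible sketch of the insert worker; POD.from_token and the exact
# frame construction are assumptions.
from datetime import timedelta
from numpy import arange
from lakota import Repo
from lakota.pod import POD          # assumed module path
from lakota.utils import drange     # assumed module path

def insert(args):
    token, label, year = args
    pod = POD.from_token(token)     # assumed: rebuild the pod from its token
    repo = Repo(pod=pod)
    series = repo / "my_collection" / label
    # One year of minute-level data
    ts = drange(f"{year}-01-01", f"{year + 1}-01-01", timedelta(minutes=1))
    series.write({"timestamp": ts, "value": arange(len(ts))})
    return len(ts)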
Example #5
def test_label_regexp():
    repo = Repo()
    ok = ["abc", "abc-abc-123", "abc_abc-123.45", "abc+abc", "$", "é"]
    for label in ok:
        repo.create_collection(SCHEMA, label)
        repo.create_collection(SCHEMA, label.upper(), raise_if_exists=False)

    not_ok = ["", "\t", "\n"]
    for label in not_ok:
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label)
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label + " ")
Example #6
def test_series_shallow_pull(size, direction, shallow):
    label = "LABEL"
    local_repo = Repo()
    remote_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    series = local_coll / label

    series.write({"timestamp": arange(size), "value": arange(size)})
    series.write({"timestamp": arange(size), "value": arange(size) * 2})

    if direction == "pull":
        remote_repo.pull(local_repo, shallow=shallow)
    else:
        local_repo.push(remote_repo, shallow=shallow)

    remote_clc = remote_repo / "a_collection"
    assert len(remote_clc.changelog.log()) == (1 if shallow else 2)

    remote_series = remote_clc / label
    expected = series.frame()
    assert remote_series.frame() == expected
Example #7
def test_gc():
    # Create pod, repo & collection
    pod = POD.from_uri("memory://")
    token = pod.token
    label = "my_label"
    repo = Repo(pod=pod)
    clc = repo.create_collection(schema, "my_collection")
    # Start cluster & schedule concurrent writes & gc
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    insert_fut = client.map(insert, args)
    gc_fut = client.submit(do_squash_and_gc, token)
    assert sum(client.gather(insert_fut)) == 10_519_200
    client.gather(gc_fut)
    client.close()
    cluster.close()
    # Read data back
    clc.merge()
    frm = clc.series("my_label").frame()
    assert len(frm) == 10_519_200
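
The do_squash_and_gc helper is not shown either. Going by the test name and the squash calls seen elsewhere, a plausible sketch (same POD.from_token assumption as above; repo.gc() as the garbage-collection entry point is also an assumption):

# Plausible sketch; POD.from_token and repo.gc() are assumptions.
def do_squash_and_gc(token):
    pod = POD.from_token(token)   # assumed helper
    repo = Repo(pod=pod)
    clc = repo / "my_collection"
    clc.squash()   # collapse the changelog into a single revision
    repo.gc()      # drop segments no longer referenced by any revision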
Example #8
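This snippet comes from a larger script: timestamps, values, suffix and the timeit helper are defined elsewhere. A hypothetical stand-in preamble, where every name and value is an assumption:

# Hypothetical preamble; all names and values are assumptions.
from contextlib import contextmanager
from time import perf_counter

from numpy import arange, sin
from pandas import DataFrame, date_range
from lakota import Repo, Schema

suffix = "17a813a84a1"   # assumed: unique run id (matches the output below)
timestamps = date_range("2020-01-01", periods=500_000, freq="min")
values = sin(arange(500_000))

@contextmanager
def timeit(title):
    # minimal stand-in for the timing helper used across these examples
    start = perf_counter()
    yield
    print(title, f"{(perf_counter() - start) * 1000:.2f}ms")
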
df = DataFrame({
    "ts": timestamps,
    "value": values,
})

df.to_csv(f"timeseries-{suffix}.csv")
df.to_parquet(f"timeseries-{suffix}.snappy.pqt", compression='snappy')
df.to_parquet(f"timeseries-{suffix}.brotli.pqt", compression='brotli')
with timeit('pqt'):
    df.to_parquet(f"timeseries-{suffix}.gzip.pqt", compression='gzip')

repo = Repo("repo")
schema = Schema(ts="timestamp*", value="float")
clct = repo / "my_collection"
if not clct:
    clct = repo.create_collection(schema, "my_collection")
series = clct / "my_series"

with timeit('lk'):
    series.write(df)

## Results

# $ python examples/data_size.py
# pqt 198.76ms
# lk 24.24ms

# $ du -hs timeseries-* repo
# 1,4M	timeseries-17a813a84a1.brotli.pqt
# 4,4M	timeseries-17a813a84a1.csv
# 1,5M	timeseries-17a813a84a1.gzip.pqt
Example #9
from lakota import Repo, Schema

# TODO use a KVSeries instead (it solves the problem explained at the bottom)

ts_schema = Schema(
    timestamp="timestamp*",
    pubtime="timestamp*",
    value="float",
)
repo = Repo()
clc = repo.create_collection(ts_schema, "my-collection")
srs = clc / "my_series"

# First insertion
df = {
    "timestamp": [
        "2020-01-01T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-02T00:00",
        "2020-01-03T00:00",
        "2020-01-03T00:00",
        "2020-01-04T00:00",
        "2020-01-04T00:00",
    ],
    "pubtime": [
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
Example #10
CHUNK_SIZES = (500, 5_000, 50_000, 500_000)


def create_df(start, stop):
    ts = arange(start, stop)
    value = arange(start, stop)
    random.shuffle(value)

    return DataFrame({"timestamp": ts, "value": value})


def call(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout


for chunk_size in CHUNK_SIZES:
    df = create_df(0, SIZE)
    with timeit(f"chunk size {chunk_size}:"):
        schema = Schema(timestamp="timestamp*", value="float")
        repo = Repo("test-db")
        collection = repo.create_collection(schema, "test")
        series = collection / "test"
        for i in range(0, SIZE, chunk_size):
            series.write(df[i : i + chunk_size])
    res = call("du -hs test-db")
    print("Disk use", res.split()[0].decode())
    call("rm -r test-db")
Example #11
schema = Schema(key="int*", **{x: "float" for x in cols})
frm = {
    "key": range(SIZE),
}
for x in cols:
    frm[x] = sin(arange(SIZE))


# Simulate network lag
def lag(fn, delay):
    def wrapper(*a, **kw):
        sleep(delay)
        return fn(*a, **kw)

    return wrapper


mempod_write = MemPOD.write

for delay in (0.001, 0.01, 0.1):
    MemPOD.write = lag(mempod_write, delay)
    for threaded in (False, True):
        settings.threaded = threaded
        with timeit(f"{delay}-{threaded}"):
            repo = Repo()
            clc = repo.create_collection(schema, "clc")
            with clc.multi():
                for name in "ABC":
                    series = clc / name
                    series.write(frm)
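
Here too, cols, SIZE, settings and the imports live in the surrounding script; a hypothetical preamble (module paths and values are assumptions, timeit as in Example #8):

# Hypothetical preamble; module paths and values are assumptions.
from time import sleep

from numpy import arange, sin
from lakota import Repo, Schema
from lakota.pod import MemPOD        # assumed module path
from lakota.utils import settings    # assumed module path

cols = [f"col_{i}" for i in range(10)]   # assumed column names
SIZE = 100_000                           # assumed frame size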
Example #12
def test_pull(threaded, large):
    c_label = "a_collection"
    s_label = "a_series"
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, c_label)
    rseries = remote_coll / s_label

    # Test support of both small datasets (where data is embedded in
    # commits) and large ones (arrays are saved on their own)
    N = 100_000 if large else 10
    for i in range(10):
        # Write 10 overlapping frames of size N to the series
        rseries.write({
            "timestamp": range(i, i + N),
            "value": range(i + 100, i + 100 + N),
        })
    nb_items = len(remote_repo.pod.ls())
    if large:
        assert nb_items > 2
    else:
        # for small arrays we have only two folders (one for the repo
        # registry, one for the collection)
        assert nb_items == 2
    expected = rseries.frame()

    # Test pull
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = local_coll / s_label
    assert lseries.frame() == expected

    # Test push
    other_repo = Repo()
    other_coll = other_repo.create_collection(schema, c_label)
    remote_coll.push(other_coll)
    oseries = other_coll / s_label
    assert oseries.frame() == expected

    # Test with existing series
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = (
        other_repo.create_collection(schema, c_label, raise_if_exists=False) /
        s_label)
    assert lseries.frame() == expected

    # Test with existing series with existing data
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    lseries = local_coll / s_label
    frm = Frame(
        schema,
        {
            "timestamp": range(0, 20),
            "value": range(0, 20),
        },
    )
    lseries.write(frm)
    local_coll.pull(remote_coll)
    assert lseries.frame() == frm

    # Test with existing series with other schema
    local_repo = Repo()
    other_schema = Schema(timestamp="int*", value="int")
    local_coll = local_repo.create_collection(other_schema, c_label)
    lseries = local_coll / s_label

    with pytest.raises(ValueError):
        local_repo.pull(remote_repo)
Example #13
def write_lk(df):
    schema = Schema(timestamp="timestamp*", value="float")
    repo = Repo("test-db")
    collection = repo.create_collection(schema, "test")
    series = collection / "test"
    series.write(df)
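
For context, a minimal way this helper might be invoked, assuming pandas is available and lakota's Repo and Schema are imported in the same module; the frame content is purely illustrative:

# Hypothetical usage of write_lk; the data is illustrative.
from pandas import DataFrame, date_range

df = DataFrame({
    "timestamp": date_range("2020-01-01", periods=1_000, freq="min"),
    "value": [float(i) for i in range(1_000)],
})
write_lk(df)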