Example #1
def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()

    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
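
# The test above relies on module-level names that are not shown:
# `insert` is the worker function listed as Example #8, and the
# companion `do_squash_and_gc` (used in Example #4) is Example #18.
# Below is an assumed sketch, not the original code, of the remaining
# globals; note that the asserted total of 10_519_200 rows matches
# twenty years of 1-minute timestamps starting in 2015 (five of those
# years are leap years: 20 * 525_600 + 5 * 1_440 = 10_519_200).
from contextlib import contextmanager
from time import perf_counter

from dask.distributed import Client, LocalCluster
from lakota import Repo, Schema

schema = Schema(timestamp="timestamp*", value="float")
years = list(range(2015, 2035))


@contextmanager
def timeit(title=""):
    # Minimal timing helper assumed by these benchmarks
    start = perf_counter()
    yield
    print(title, f"{perf_counter() - start:.2f}s")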
Example #2
def test_label_regexp():
    repo = Repo()
    ok = ["abc", "abc-abc-123", "abc_abc-123.45", "abc+abc", "$", "é"]
    for label in ok:
        repo.create_collection(SCHEMA, label)
        repo.create_collection(SCHEMA, label.upper(), raise_if_exists=False)

    not_ok = ["", "\t", "\n"]
    for label in not_ok:
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label)
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label + " ")
Example #3
def test_import_export(repo):
    clct = repo.create_collection(SCHEMA, "test_coll")
    series = clct / "test_series"
    series.write({
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    })

    tmp_pod = MemPOD(".")
    repo.export_collections(tmp_pod)

    repo_bis = Repo("memory://")
    repo_bis.import_collections(tmp_pod)
    frm = repo.collection("test_coll").series("test_series").frame()
    frm_bis = repo_bis.collection("test_coll").series("test_series").frame()
    assert frm == frm_bis
Example #4
def test_gc():
    # Create pod, repo & collection
    pod = POD.from_uri("memory://")
    token = pod.token
    label = "my_label"
    repo = Repo(pod=pod)
    clc = repo.create_collection(schema, "my_collection")
    # Start cluster & schedule concurrent writes & gc
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    insert_fut = client.map(insert, args)
    gc_fut = client.submit(do_squash_and_gc, token)
    assert sum(client.gather(insert_fut)) == 10_519_200
    client.gather(gc_fut)
    client.close()
    cluster.close()
    # Read data back
    clc.merge()
    frm = clc.series("my_label").frame()
    assert len(frm) == 10_519_200
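
# `insert` is defined in Example #8 and `do_squash_and_gc` in Example #18;
# `schema` and `years` are as sketched after Example #1.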
Example #5
def test_double_slice(frame_values, frm):
    # in-memory frame
    frm = frm.slice(1, None).slice(None, 2)
    assert all(frm["value"] == VALUES[1:][:2])

    # frame created from repo
    collection = Repo().create_collection(frm.schema, "collection")
    series = collection / "my-label"
    series.write(frame_values)
    frm = series.frame()
    frm = frm.slice(1, None).slice(None, 2)
    assert all(frm["value"] == VALUES[1:][:2])
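
# `frame_values`, `frm` and `VALUES` are pytest fixtures and constants
# that are not shown; a hypothetical reconstruction consistent with the
# assertions above:
import pytest
from lakota import Frame, Schema

VALUES = [1.0, 2.0, 3.0, 4.0, 5.0]


@pytest.fixture
def frame_values():
    return {"timestamp": list(range(len(VALUES))), "value": VALUES}


@pytest.fixture
def frm(frame_values):
    return Frame(Schema(timestamp="int*", value="float"), frame_values)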
Example #6
def test_series_squash_stability():
    label = "LABEL"
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, "a_collection")
    series = local_coll / label

    months = list(range(1, 12))
    delta = timedelta(days=1)
    for start, stop in zip(months[:-1], months[1:]):
        ts = drange(f"2020-{start:02}-01", f"2020-{stop:02}-01", delta)
        values = [start] * len(ts)
        series.write({"timestamp": ts, "value": values})

    local_coll.push(remote_coll)
    local_coll.squash()
    remote_coll.squash()

    local_files = local_coll.pod.walk()
    remote_files = remote_coll.pod.walk()

    local_digests = set(
        Revision.from_path(local_coll.changelog, f).digests
        for f in local_files if "." in f)
    remote_digests = set(
        Revision.from_path(remote_coll.changelog, f).digests
        for f in remote_files if "." in f)
    assert local_digests == remote_digests
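
# Imports assumed by the test above; the exact module paths inside
# lakota are an assumption:
from datetime import timedelta

from lakota import Repo
from lakota.changelog import Revision  # assumed import path
from lakota.utils import drange        # assumed import path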
Example #7
def test_series_shallow_pull(size, direction, shallow):
    label = "LABEL"
    local_repo = Repo()
    remote_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    series = local_coll / label

    series.write({"timestamp": arange(size), "value": arange(size)})
    series.write({"timestamp": arange(size), "value": arange(size) * 2})

    if direction == "pull":
        remote_repo.pull(local_repo, shallow=shallow)
    else:
        local_repo.push(remote_repo, shallow=shallow)

    remote_clc = remote_repo / "a_collection"
    assert len(remote_clc.changelog.log()) == (1 if shallow else 2)

    remote_series = remote_clc / label
    expected = series.frame()
    assert remote_series.frame() == expected
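
# `size`, `direction` and `shallow` are pytest parameters supplied by
# parametrize/fixtures (not shown); `arange` is numpy's and `schema` is
# as sketched after Example #1.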
Example #8
def insert(args):
    token, label, year = args
    pod = POD.from_token(token)
    repo = Repo(pod=pod)
    collection = repo / "my_collection"
    series = collection / label
    ts = date_range(f"{year}-01-01",
                    f"{year+1}-01-01",
                    freq="1min",
                    closed="left")
    df = DataFrame({
        "timestamp": ts,
        "value": numpy.round(numpy.random.random(len(ts)) * 1000, decimals=0),
    })
    sgm = Frame(schema, df)
    series.write(sgm)
    return len(sgm)
Example #9
def run(repo_map, web_uri=None, debug=False):
    parts = urlsplit(web_uri)
    if parts.scheme != "http":
        # if no scheme is given, hostname and port are not interpreted correctly
        msg = "Incorrect web uri, it should start with 'http://'"
        raise ValueError(msg)

    # Instantiate app and blueprints, then run the app
    app = Flask("Lakota Repository")
    prefixes = []
    for name, uri in repo_map.items():
        prefix = parts.path + "/" + name.strip("/")
        repo = Repo(uri)
        app.register_blueprint(pod_bp, url_prefix=prefix, url_defaults={"repo": repo})
        prefixes.append(prefix)

    # Add index page
    app.route("/")(lambda: index(prefixes))
    app.run(parts.hostname, debug=debug, port=parts.port)
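
# A hypothetical invocation: serve two repositories, mounted under
# /main and /backup, on localhost:8080 (names and URIs are examples):
run(
    {"main": "memory://", "backup": ".lakota"},
    web_uri="http://localhost:8080",
    debug=True,
)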
Example #10
def test_label_delete_push(squash):
    kv_schema = Schema.kv(timestamp="int*", value="float")

    labels = list("abcd")
    local_repo = Repo()
    local_clct = local_repo.create_collection(kv_schema, "a_collection")
    remote_repo = Repo()
    remote_clct = remote_repo.create_collection(kv_schema, "a_collection")

    # Write some data
    frm = {
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    }
    for label in labels:
        series = local_clct / label
        series.write(frm)

    # Create some labels and push them
    local_clct.push(remote_clct)
    if squash:
        remote_clct.squash()
    assert local_clct.ls() == labels
    assert remote_clct.ls() == labels

    # Delete one local label and push again
    local_clct.delete("c")
    local_clct.push(remote_clct)
    if squash:
        remote_clct.merge()
        remote_clct.squash()
    else:
        remote_clct.refresh()

    assert remote_clct.ls() == list("abd")
    assert local_clct.ls() == list("abd")

    # Delete one remote label and pull
    sleep(0.1)  # Needed to avoid concurrent writes
    remote_clct.delete("d")
    local_clct.pull(remote_clct)
    if squash:
        local_clct.squash()
    else:
        local_clct.refresh()
    assert remote_clct.ls() == list("ab")
    assert local_clct.ls() == list("ab")
Example #11
from lakota import Repo, Schema

# TODO use a KVSeries instead (it solves the problem explained at the bottom)

ts_schema = Schema(
    timestamp="timestamp*",
    pubtime="timestamp*",
    value="float",
)
repo = Repo()
clc = repo.create_collection(ts_schema, "my-collection")
srs = clc / "my_series"

# First insertion
df = {
    "timestamp": [
        "2020-01-01T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-02T00:00",
        "2020-01-03T00:00",
        "2020-01-03T00:00",
        "2020-01-04T00:00",
        "2020-01-04T00:00",
    ],
    "pubtime": [
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
Example #12
CHUNK_SIZES = (500, 5_000, 50_000, 500_000)


def create_df(start, stop):
    ts = arange(start, stop)
    value = arange(start, stop)
    random.shuffle(value)

    return DataFrame({"timestamp": ts, "value": value})


def call(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout


for chunk_size in CHUNK_SIZES:
    df = create_df(0, SIZE)
    with timeit(f"chunk size {chunk_size}:"):
        schema = Schema(timestamp="timestamp*", value="float")
        repo = Repo("test-db")
        collection = repo.create_collection(schema, "test")
        series = collection / "test"
        for i in range(0, SIZE, chunk_size):
            series.write(df[i : i + chunk_size])
    res = call("du -hs test-db")
    print("Disk use", res.split()[0].decode())
    call("rm -r test-db")
Example #13
import os
from pathlib import Path

from chalice import Chalice, Response
from chalice import BadRequestError
from lakota import Repo
from lakota.utils import logger
from jinja2 import Environment, FileSystemLoader
from numpy import asarray, char
from numpy.core.defchararray import find

logger.setLevel('CRITICAL')
title = os.environ.get('APP_TITLE', 'Lakota')
uri = os.environ.get('LAKOTA_REPO', '.lakota')
app_prefix = os.environ.get('APP_PREFIX', '')
static_prefix = 'static'
repo = Repo(['/tmp/lakota-cache', uri])

PAGE_LEN = 20_000
lib_path = Path(__file__).parent / 'chalicelib'
tpl_path = lib_path / 'template'
static_path = lib_path / 'static'
env = Environment(loader=FileSystemLoader([tpl_path]))
app = Chalice(app_name='lakota-lambda')
app.api.binary_types.extend(['application/json'])

uplot_options = {
    # 'title': '',
    # 'id': '',
    # 'class': '',
    'width': 900,
Example #14
schema = Schema(key="int*", **{x: "float" for x in cols})
frm = {
    "key": range(SIZE),
}
for x in cols:
    frm[x] = sin(arange(SIZE))


# Simulate network lag
def lag(fn, delay):
    def wrapper(*a, **kw):
        sleep(delay)
        return fn(*a, **kw)

    return wrapper


mempod_write = MemPOD.write

for delay in (0.001, 0.01, 0.1):
    MemPOD.write = lag(mempod_write, delay)
    for threaded in (False, True):
        settings.threaded = threaded
        with timeit(f"{delay}-{threaded}"):
            repo = Repo()
            clc = repo.create_collection(schema, "clc")
            with clc.multi():
                for name in "ABC":
                    series = clc / name
                    series.write(frm)
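
# Globals assumed by the snippet above; values and lakota import paths
# are assumptions, `timeit` is as sketched after Example #1:
from time import sleep

from numpy import arange, sin
from lakota import Repo, Schema
from lakota.pod import MemPOD      # assumed import path
from lakota.utils import settings  # assumed home of the `threaded` flag

SIZE = 100_000
cols = ["a", "b", "c"]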
Example #15
def read_lk():
    repo = Repo("test-db")
    collection = repo / "test"
    series = collection / "test"
    return series.frame()
Example #16
def repo():
    return Repo(pod=MemPOD("."))
Example #17
def test_pull(threaded, large):
    c_label = "a_collection"
    s_label = "a_series"
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, c_label)
    rseries = remote_coll / s_label

    # Test support of both small datasets (where data is embedded in
    # commits) and large ones (where arrays are saved on their own)
    N = 100_000 if large else 10
    for i in range(10):
        # Write 10 overlapping frames of size N
        rseries.write({
            "timestamp": range(i, i + N),
            "value": range(i + 100, i + 100 + N),
        })
    nb_items = len(remote_repo.pod.ls())
    if large:
        assert nb_items > 2
    else:
        # for small arrays we have only two folders (one for the repo
        # registry, one for the collection)
        assert nb_items == 2
    expected = rseries.frame()

    # Test pull
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = local_coll / s_label
    assert lseries.frame() == expected

    # Test push
    other_repo = Repo()
    other_coll = other_repo.create_collection(schema, c_label)
    remote_coll.push(other_coll)
    oseries = other_coll / s_label
    assert oseries.frame() == expected

    # Test with existing series
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = (
        other_repo.create_collection(schema, c_label, raise_if_exists=False) /
        s_label)
    assert lseries.frame() == expected

    # Test with existing series with existing data
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    lseries = local_coll / s_label
    frm = Frame(
        schema,
        {
            "timestamp": range(0, 20),
            "value": range(0, 20),
        },
    )
    lseries.write(frm)
    local_coll.pull(remote_coll)
    assert lseries.frame() == frm

    # Test with existing series with other schema
    local_repo = Repo()
    other_schema = Schema(timestamp="int*", value="int")
    local_coll = local_repo.create_collection(other_schema, c_label)
    lseries = local_coll / s_label

    with pytest.raises(ValueError):
        local_repo.pull(remote_repo)
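
# `schema` is as sketched after Example #1; `threaded` and `large` are
# pytest parameters (not shown).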
Example #18
def do_squash_and_gc(token):
    pod = POD.from_token(token)
    for i in range(10):
        repo = Repo(pod=pod)
        repo.gc()
        sleep(0.05)
Example #19
def test_refresh():
    pod = MemPOD(".")
    repo = Repo(pod=pod)

    repo.create_collection(SCHEMA, "collection")
    assert repo.ls() == ["collection"]
    repo2 = Repo(pod=pod)
    repo2.delete("collection")
    # repo is out of sync
    assert repo.ls() == ["collection"]
    # refresh solves this
    repo.refresh()
    assert repo.ls() == []
Example #20
def repo():
    return Repo("memory://")
Example #21
SIZE = 100_000
values = random(SIZE)
timestamps = date_range("1970-01-01", freq="5min", periods=SIZE)
df = DataFrame({
    "ts": timestamps,
    "value": values,
})

df.to_csv(f"timeseries-{suffix}.csv")
df.to_parquet(f"timeseries-{suffix}.snappy.pqt", compression='snappy')
df.to_parquet(f"timeseries-{suffix}.brotli.pqt", compression='brotli')
with timeit('pqt'):
    df.to_parquet(f"timeseries-{suffix}.gzip.pqt", compression='gzip')

repo = Repo("repo")
schema = Schema(ts="timestamp*", value="float")
clct = repo / "my_collection"
if not clct:
    clct = repo.create_collection(schema, "my_collection")
series = clct / "my_series"

with timeit('lk'):
    series.write(df)

## Results

# $ python examples/data_size.py
# pqt 198.76ms
# lk 24.24ms
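
# Header assumed by the script above; `suffix` and the imports are not
# shown, `timeit` is as sketched after Example #1:
from numpy.random import random
from pandas import DataFrame, date_range
from lakota import Repo, Schema

suffix = "demo"  # hypothetical file-name suffix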
Example #22
def write_lk(df):
    schema = Schema(timestamp="timestamp*", value="float")
    repo = Repo("test-db")
    collection = repo.create_collection(schema, "test")
    series = collection / "test"
    series.write(df)
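
# `write_lk` pairs with `read_lk` from Example #15; a hypothetical
# round-trip through the on-disk "test-db" repository:
from pandas import DataFrame, date_range

df = DataFrame({
    "timestamp": date_range("2020-01-01", periods=3, freq="1min"),
    "value": [1.0, 2.0, 3.0],
})
write_lk(df)
assert len(read_lk()) == 3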