def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()

    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
def test_label_regexp():
    repo = Repo()
    ok = ["abc", "abc-abc-123", "abc_abc-123.45", "abc+abc", "$", "é"]
    for label in ok:
        repo.create_collection(SCHEMA, label)
        repo.create_collection(SCHEMA, label.upper(), raise_if_exists=False)

    not_ok = ["", "\t", "\n"]
    for label in not_ok:
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label)
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label + " ")
def test_import_export(repo):
    clct = repo.create_collection(SCHEMA, "test_coll")
    series = clct / "test_series"
    series.write({
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    })
    tmp_pod = MemPOD(".")
    repo.export_collections(tmp_pod)

    repo_bis = Repo("memory://")
    repo_bis.import_collections(tmp_pod)

    frm = repo.collection("test_coll").series("test_series").frame()
    frm_bis = repo_bis.collection("test_coll").series("test_series").frame()
    assert frm == frm_bis
def test_gc():
    # Create pod, repo & collection
    pod = POD.from_uri("memory://")
    token = pod.token
    label = "my_label"
    repo = Repo(pod=pod)
    clc = repo.create_collection(schema, "my_collection")

    # Start cluster & schedule concurrent writes & gc
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    insert_fut = client.map(insert, args)
    gc_fut = client.submit(do_squash_and_gc, token)
    assert sum(client.gather(insert_fut)) == 10_519_200
    client.gather(gc_fut)
    client.close()
    cluster.close()

    # Read data back
    clc.merge()
    frm = clc.series("my_label").frame()
    assert len(frm) == 10_519_200
def test_double_slice(frame_values, frm):
    # In-memory frame
    frm = frm.slice(1, None).slice(None, 2)
    assert all(frm["value"] == VALUES[1:][:2])

    # Frame created from repo collection
    collection = Repo().create_collection(frm.schema, "collection")
    series = collection / "my-label"
    series.write(frame_values)
    frm = series.frame()
    frm = frm.slice(1, None).slice(None, 2)
    assert all(frm["value"] == VALUES[1:][:2])
def test_series_squash_stability():
    label = "LABEL"
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, "a_collection")
    series = local_coll / label

    months = list(range(1, 12))
    delta = timedelta(days=1)
    for start, stop in zip(months[:-1], months[1:]):
        ts = drange(f"2020-{start:02}-01", f"2020-{stop:02}-01", delta)
        values = [start] * len(ts)
        series.write({"timestamp": ts, "value": values})

    local_coll.push(remote_coll)
    local_coll.squash()
    remote_coll.squash()

    local_files = local_coll.pod.walk()
    remote_files = remote_coll.pod.walk()
    local_digests = set(
        Revision.from_path(local_coll.changelog, f).digests
        for f in local_files
        if "." in f
    )
    remote_digests = set(
        Revision.from_path(remote_coll.changelog, f).digests
        for f in remote_files
        if "." in f
    )
    assert local_digests == remote_digests
def test_series_shallow_pull(size, direction, shallow):
    label = "LABEL"
    local_repo = Repo()
    remote_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    series = local_coll / label
    series.write({"timestamp": arange(size), "value": arange(size)})
    series.write({"timestamp": arange(size), "value": arange(size) * 2})

    if direction == "pull":
        remote_repo.pull(local_repo, shallow=shallow)
    else:
        local_repo.push(remote_repo, shallow=shallow)

    remote_clc = remote_repo / "a_collection"
    assert len(remote_clc.changelog.log()) == (1 if shallow else 2)
    remote_series = remote_clc / label
    expected = series.frame()
    assert remote_series.frame() == expected
def insert(args):
    token, label, year = args
    pod = POD.from_token(token)
    repo = Repo(pod=pod)
    collection = repo / "my_collection"
    series = collection / label
    ts = date_range(f"{year}-01-01", f"{year+1}-01-01", freq="1min", closed="left")
    df = DataFrame({
        "timestamp": ts,
        "value": numpy.round(numpy.random.random(len(ts)) * 1000, decimals=0),
    })
    sgm = Frame(schema, df)
    series.write(sgm)
    return len(sgm)
def run(repo_map, web_uri=None, debug=False):
    parts = urlsplit(web_uri)
    if parts.scheme != "http":
        # If no scheme is given, hostname and port are not interpreted correctly
        msg = "Incorrect web uri, it should start with 'http://'"
        raise ValueError(msg)

    # Instantiate app and blueprint, then run the app
    app = Flask("Lakota Repository")
    prefixes = []
    for name, uri in repo_map.items():
        prefix = parts.path + "/" + name.strip("/")
        repo = Repo(uri)
        app.register_blueprint(pod_bp, url_prefix=prefix, url_defaults={"repo": repo})
        prefixes.append(prefix)

    # Add index page
    app.route("/")(lambda: index(prefixes))
    app.run(parts.hostname, debug=debug, port=parts.port)
def test_label_delete_push(squash):
    kv_schema = Schema.kv(timestamp="int*", value="float")
    labels = list("abcd")
    local_repo = Repo()
    local_clct = local_repo.create_collection(kv_schema, "a_collection")
    remote_repo = Repo()
    remote_clct = remote_repo.create_collection(kv_schema, "a_collection")

    # Write some data
    frm = {
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    }
    for label in labels:
        series = local_clct / label
        series.write(frm)

    # Create some labels and push them
    local_clct.push(remote_clct)
    if squash:
        remote_clct.squash()
    assert local_clct.ls() == labels
    assert remote_clct.ls() == labels

    # Delete one local label and push again
    local_clct.delete("c")
    local_clct.push(remote_clct)
    if squash:
        remote_clct.merge()
        remote_clct.squash()
    else:
        remote_clct.refresh()
    assert remote_clct.ls() == list("abd")
    assert local_clct.ls() == list("abd")

    # Delete one remote label and pull
    sleep(0.1)  # Needed to avoid concurrent writes
    remote_clct.delete("d")
    local_clct.pull(remote_clct)
    if squash:
        local_clct.squash()
    else:
        local_clct.refresh()
    assert remote_clct.ls() == list("ab")
    assert local_clct.ls() == list("ab")
from lakota import Repo, Schema

# TODO use a KVSeries instead (it solves the problem explained at the bottom)
ts_schema = Schema(
    timestamp="timestamp*",
    pubtime="timestamp*",
    value="float",
)

repo = Repo()
clc = repo.create_collection(ts_schema, "my-collection")
srs = clc / "my_series"

# First insertion
df = {
    "timestamp": [
        "2020-01-01T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-02T00:00",
        "2020-01-03T00:00",
        "2020-01-03T00:00",
        "2020-01-04T00:00",
        "2020-01-04T00:00",
    ],
    "pubtime": [
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
CHUNK_SIZES = (500, 5_000, 50_000, 500_000)


def create_df(start, stop):
    ts = arange(start, stop)
    value = arange(start, stop)
    random.shuffle(value)
    return DataFrame({"timestamp": ts, "value": value})


def call(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout


for chunk_size in CHUNK_SIZES:
    df = create_df(0, SIZE)
    with timeit(f"chunk size {chunk_size}:"):
        schema = Schema(timestamp="timestamp*", value="float")
        repo = Repo("test-db")
        collection = repo.create_collection(schema, "test")
        series = collection / "test"
        for i in range(0, SIZE, chunk_size):
            series.write(df[i : i + chunk_size])

    res = call("du -hs test-db")
    print("Disk use", res.split()[0].decode())
    call("rm -r test-db")
import os
from pathlib import Path

from chalice import Chalice, Response
from chalice import BadRequestError
from lakota import Repo
from lakota.utils import logger
from jinja2 import Environment, FileSystemLoader
from numpy import asarray, char
from numpy.core.defchararray import find

logger.setLevel('CRITICAL')

title = os.environ.get('APP_TITLE', 'Lakota')
uri = os.environ.get('LAKOTA_REPO', '.lakota')
app_prefix = os.environ.get('APP_PREFIX', '')
static_prefix = 'static'
repo = Repo(['/tmp/lakota-cache', uri])
PAGE_LEN = 20_000

lib_path = Path(__file__).parent / 'chalicelib'
tpl_path = lib_path / 'template'
static_path = lib_path / 'static'
env = Environment(loader=FileSystemLoader([tpl_path]))
app = Chalice(app_name='lakota-lambda')
app.api.binary_types.extend(['application/json'])

uplot_options = {
    # 'title': '',
    # 'id': '',
    # 'class': '',
    'width': 900,
schema = Schema(key="int*", **{x: "float" for x in cols})
frm = {
    "key": range(SIZE),
}
for x in cols:
    frm[x] = sin(arange(SIZE))


# Simulate network lag
def lag(fn, delay):
    def wrapper(*a, **kw):
        sleep(delay)
        return fn(*a, **kw)

    return wrapper


mempod_write = MemPOD.write
for delay in (0.001, 0.01, 0.1):
    MemPOD.write = lag(mempod_write, delay)
    for threaded in (False, True):
        settings.threaded = threaded
        with timeit(f"{delay}-{threaded}"):
            repo = Repo()
            clc = repo.create_collection(schema, "clc")
            with clc.multi():
                for name in "ABC":
                    series = clc / name
                    series.write(frm)
def read_lk():
    # Read the full series back from the on-disk repo
    repo = Repo("test-db")
    collection = repo / "test"
    series = collection / "test"
    return series.frame()
def repo():
    return Repo(pod=MemPOD("."))
def test_pull(threaded, large):
    c_label = "a_collection"
    s_label = "a_series"
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, c_label)
    rseries = remote_coll / s_label

    # Test support of both small datasets (where data is embedded in
    # commits) and large ones (arrays are saved on their own)
    N = 100_000 if large else 10
    for i in range(10):
        # Create 10 series of size N
        rseries.write({
            "timestamp": range(i, i + N),
            "value": range(i + 100, i + 100 + N),
        })
    nb_items = len(remote_repo.pod.ls())
    if large:
        assert nb_items > 2
    else:
        # For small arrays we have only two folders (one for the repo
        # registry, one for the collection)
        assert nb_items == 2
    expected = rseries.frame()

    # Test pull
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = local_coll / s_label
    assert lseries.frame() == expected

    # Test push
    other_repo = Repo()
    other_coll = other_repo.create_collection(schema, c_label)
    remote_coll.push(other_coll)
    oseries = other_coll / s_label
    assert oseries.frame() == expected

    # Test with existing series
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = (
        other_repo.create_collection(schema, c_label, raise_if_exists=False)
        / s_label
    )
    assert oseries.frame() == expected

    # Test with existing series with existing data
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    lseries = local_coll / s_label
    frm = Frame(
        schema,
        {
            "timestamp": range(0, 20),
            "value": range(0, 20),
        },
    )
    lseries.write(frm)
    local_coll.pull(remote_coll)
    assert lseries.frame() == frm

    # Test with existing series with other schema
    local_repo = Repo()
    other_schema = Schema(timestamp="int*", value="int")
    local_coll = local_repo.create_collection(other_schema, c_label)
    lseries = local_coll / s_label
    with pytest.raises(ValueError):
        local_repo.pull(remote_repo)
def do_squash_and_gc(token):
    pod = POD.from_token(token)
    for i in range(10):
        repo = Repo(pod=pod)
        repo.gc()
        sleep(0.05)
def test_refresh():
    pod = MemPOD(".")
    repo = Repo(pod=pod)
    repo.create_collection(SCHEMA, "collection")
    assert repo.ls() == ["collection"]

    repo2 = Repo(pod=pod)
    repo2.delete("collection")
    # repo is out of sync
    assert repo.ls() == ["collection"]
    # refresh solves this
    repo.refresh()
    assert repo.ls() == []
def repo():
    return Repo("memory://")
SIZE = 100_000

values = random(SIZE)
timestamps = date_range("1970-01-01", freq="5min", periods=SIZE)
df = DataFrame({
    "ts": timestamps,
    "value": values,
})
df.to_csv(f"timeseries-{suffix}.csv")
df.to_parquet(f"timeseries-{suffix}.snappy.pqt", compression='snappy')
df.to_parquet(f"timeseries-{suffix}.brotli.pqt", compression='brotli')
with timeit('pqt'):
    df.to_parquet(f"timeseries-{suffix}.gzip.pqt", compression='gzip')

repo = Repo("repo")
schema = Schema(ts="timestamp*", value="float")
clct = repo / "my_collection"
if not clct:
    clct = repo.create_collection(schema, "my_collection")
series = clct / "my_series"
with timeit('lk'):
    series.write(df)

## Results
# $ python examples/data_size.py
# pqt 198.76ms
# lk 24.24ms
def write_lk(df):
    schema = Schema(timestamp="timestamp*", value="float")
    repo = Repo("test-db")
    collection = repo.create_collection(schema, "test")
    series = collection / "test"
    series.write(df)