def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()

    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
def test_label_regexp():
    repo = Repo()
    ok = ["abc", "abc-abc-123", "abc_abc-123.45", "abc+abc", "$", "é"]
    for label in ok:
        repo.create_collection(SCHEMA, label)
        repo.create_collection(SCHEMA, label.upper(), raise_if_exists=False)

    not_ok = ["", "\t", "\n"]
    for label in not_ok:
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label)
        with pytest.raises(ValueError):
            repo.create_collection(SCHEMA, label + " ")
def test_import_export(repo):
    clct = repo.create_collection(SCHEMA, "test_coll")
    series = clct / "test_series"
    series.write({
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    })
    tmp_pod = MemPOD(".")
    repo.export_collections(tmp_pod)

    repo_bis = Repo("memory://")
    repo_bis.import_collections(tmp_pod)

    frm = repo.collection("test_coll").series("test_series").frame()
    frm_bis = repo_bis.collection("test_coll").series("test_series").frame()
    assert frm == frm_bis
def test_gc():
    # Create pod, repo & collection
    pod = POD.from_uri("memory://")
    token = pod.token
    label = "my_label"
    repo = Repo(pod=pod)
    clc = repo.create_collection(schema, "my_collection")

    # Start cluster & schedule concurrent writes & gc
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    insert_fut = client.map(insert, args)
    gc_fut = client.submit(do_squash_and_gc, token)
    assert sum(client.gather(insert_fut)) == 10_519_200
    client.gather(gc_fut)
    client.close()
    cluster.close()

    # Read data back
    clc.merge()
    frm = clc.series("my_label").frame()
    assert len(frm) == 10_519_200
def test_double_slice(frame_values, frm):
    # In-memory frame
    frm = frm.slice(1, None).slice(None, 2)
    assert all(frm["value"] == VALUES[1:][:2])

    # Frame created from repo collection
    collection = Repo().create_collection(frm.schema, "collection")
    series = collection / "my-label"
    series.write(frame_values)
    frm = series.frame()
    frm = frm.slice(1, None).slice(None, 2)
    assert all(frm["value"] == VALUES[1:][:2])
def test_series_squash_stability():
    label = "LABEL"
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, "a_collection")
    series = local_coll / label

    months = list(range(1, 12))
    delta = timedelta(days=1)
    for start, stop in zip(months[:-1], months[1:]):
        ts = drange(f"2020-{start:02}-01", f"2020-{stop:02}-01", delta)
        values = [start] * len(ts)
        series.write({"timestamp": ts, "value": values})

    local_coll.push(remote_coll)
    local_coll.squash()
    remote_coll.squash()

    local_files = local_coll.pod.walk()
    remote_files = remote_coll.pod.walk()
    local_digests = set(
        Revision.from_path(local_coll.changelog, f).digests
        for f in local_files
        if "." in f
    )
    remote_digests = set(
        Revision.from_path(remote_coll.changelog, f).digests
        for f in remote_files
        if "." in f
    )
    assert local_digests == remote_digests
def test_series_shallow_pull(size, direction, shallow):
    label = "LABEL"
    local_repo = Repo()
    remote_repo = Repo()
    local_coll = local_repo.create_collection(schema, "a_collection")
    series = local_coll / label
    series.write({"timestamp": arange(size), "value": arange(size)})
    series.write({"timestamp": arange(size), "value": arange(size) * 2})

    if direction == "pull":
        remote_repo.pull(local_repo, shallow=shallow)
    else:
        local_repo.push(remote_repo, shallow=shallow)

    remote_clc = remote_repo / "a_collection"
    assert len(remote_clc.changelog.log()) == (1 if shallow else 2)
    remote_series = remote_clc / label
    expected = series.frame()
    assert remote_series.frame() == expected
def insert(args):
    token, label, year = args
    pod = POD.from_token(token)
    repo = Repo(pod=pod)
    collection = repo / "my_collection"
    series = collection / label
    ts = date_range(f"{year}-01-01", f"{year+1}-01-01", freq="1min", closed="left")
    df = DataFrame({
        "timestamp": ts,
        "value": numpy.round(numpy.random.random(len(ts)) * 1000, decimals=0),
    })
    sgm = Frame(schema, df)
    series.write(sgm)
    return len(sgm)
def run(repo_map, web_uri=None, debug=False):
    parts = urlsplit(web_uri)
    if parts.scheme != "http":
        # If no scheme is given, hostname and port are not interpreted correctly
        msg = "Incorrect web uri, it should start with 'http://'"
        raise ValueError(msg)

    # Instantiate app and blueprint, then run the app
    app = Flask("Lakota Repository")
    prefixes = []
    for name, uri in repo_map.items():
        prefix = parts.path + "/" + name.strip("/")
        repo = Repo(uri)
        app.register_blueprint(pod_bp, url_prefix=prefix, url_defaults={"repo": repo})
        prefixes.append(prefix)

    # Add index page
    app.route("/")(lambda: index(prefixes))
    app.run(parts.hostname, debug=debug, port=parts.port)
def test_label_delete_push(squash):
    kv_schema = Schema.kv(timestamp="int*", value="float")
    labels = list("abcd")
    local_repo = Repo()
    local_clct = local_repo.create_collection(kv_schema, "a_collection")
    remote_repo = Repo()
    remote_clct = remote_repo.create_collection(kv_schema, "a_collection")

    # Write some data
    frm = {
        "timestamp": [1, 2, 3],
        "value": [1, 2, 3],
    }
    for label in labels:
        series = local_clct / label
        series.write(frm)

    # Create some labels and push them
    local_clct.push(remote_clct)
    if squash:
        remote_clct.squash()
    assert local_clct.ls() == labels
    assert remote_clct.ls() == labels

    # Delete one local label and push again
    local_clct.delete("c")
    local_clct.push(remote_clct)
    if squash:
        remote_clct.merge()
        remote_clct.squash()
    else:
        remote_clct.refresh()
    assert remote_clct.ls() == list("abd")
    assert local_clct.ls() == list("abd")

    # Delete one remote label and pull
    sleep(0.1)  # Needed to avoid concurrent writes
    remote_clct.delete("d")
    local_clct.pull(remote_clct)
    if squash:
        local_clct.squash()
    else:
        local_clct.refresh()
    assert remote_clct.ls() == list("ab")
    assert local_clct.ls() == list("ab")
from lakota import Repo, Schema

# TODO use a KVSeries instead (it solves the problem explained at the bottom)
ts_schema = Schema(
    timestamp="timestamp*",
    pubtime="timestamp*",
    value="float",
)

repo = Repo()
clc = repo.create_collection(ts_schema, "my-collection")
srs = clc / "my_series"

# First insertion
df = {
    "timestamp": [
        "2020-01-01T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-02T00:00",
        "2020-01-03T00:00",
        "2020-01-03T00:00",
        "2020-01-04T00:00",
        "2020-01-04T00:00",
    ],
    "pubtime": [
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
        "2020-01-02T00:00",
        "2020-01-01T00:00",
CHUNK_SIZES = (500, 5_000, 50_000, 500_000)


def create_df(start, stop):
    ts = arange(start, stop)
    value = arange(start, stop)
    random.shuffle(value)
    return DataFrame({"timestamp": ts, "value": value})


def call(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout


for chunk_size in CHUNK_SIZES:
    df = create_df(0, SIZE)
    with timeit(f"chunk size {chunk_size}:"):
        schema = Schema(timestamp="timestamp*", value="float")
        repo = Repo("test-db")
        collection = repo.create_collection(schema, "test")
        series = collection / "test"
        for i in range(0, SIZE, chunk_size):
            series.write(df[i : i + chunk_size])

    res = call("du -hs test-db")
    print("Disk use", res.split()[0].decode())
    call("rm -r test-db")
import os
from pathlib import Path

from chalice import Chalice, Response
from chalice import BadRequestError
from lakota import Repo
from lakota.utils import logger
from jinja2 import Environment, FileSystemLoader
from numpy import asarray, char
from numpy.core.defchararray import find

logger.setLevel('CRITICAL')

title = os.environ.get('APP_TITLE', 'Lakota')
uri = os.environ.get('LAKOTA_REPO', '.lakota')
app_prefix = os.environ.get('APP_PREFIX', '')
static_prefix = 'static'
repo = Repo(['/tmp/lakota-cache', uri])
PAGE_LEN = 20_000

lib_path = Path(__file__).parent / 'chalicelib'
tpl_path = lib_path / 'template'
static_path = lib_path / 'static'
env = Environment(loader=FileSystemLoader([tpl_path]))
app = Chalice(app_name='lakota-lambda')
app.api.binary_types.extend(['application/json'])

uplot_options = {
    # 'title': '',
    # 'id': '',
    # 'class': '',
    'width': 900,
schema = Schema(key="int*", **{x: "float" for x in cols})
frm = {
    "key": range(SIZE),
}
for x in cols:
    frm[x] = sin(arange(SIZE))


# Simulate network lag
def lag(fn, delay):
    def wrapper(*a, **kw):
        sleep(delay)
        return fn(*a, **kw)

    return wrapper


mempod_write = MemPOD.write
for delay in (0.001, 0.01, 0.1):
    MemPOD.write = lag(mempod_write, delay)
    for threaded in (False, True):
        settings.threaded = threaded
        with timeit(f"{delay}-{threaded}"):
            repo = Repo()
            clc = repo.create_collection(schema, "clc")
            with clc.multi():
                for name in "ABC":
                    series = clc / name
                    series.write(frm)
def read_lk():
    # Read the full series back from the on-disk repo
    repo = Repo("test-db")
    collection = repo / "test"
    series = collection / "test"
    return series.frame()
def repo():
    return Repo(pod=MemPOD("."))
def test_pull(threaded, large):
    c_label = "a_collection"
    s_label = "a_series"
    remote_repo = Repo()
    remote_coll = remote_repo.create_collection(schema, c_label)
    rseries = remote_coll / s_label

    # Test support of both small datasets (where data is embedded in
    # commits) and large ones (arrays are saved on their own)
    N = 100_000 if large else 10
    for i in range(10):
        # Create 10 series of size N
        rseries.write({
            "timestamp": range(i, i + N),
            "value": range(i + 100, i + 100 + N),
        })
    nb_items = len(remote_repo.pod.ls())
    if large:
        assert nb_items > 2
    else:
        # For small arrays we have only two folders (one for the repo
        # registry, one for the collection)
        assert nb_items == 2
    expected = rseries.frame()

    # Test pull
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = local_coll / s_label
    assert lseries.frame() == expected

    # Test push
    other_repo = Repo()
    other_coll = other_repo.create_collection(schema, c_label)
    remote_coll.push(other_coll)
    oseries = other_coll / s_label
    assert oseries.frame() == expected

    # Test with existing series
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    local_coll.pull(remote_coll)
    lseries = (
        other_repo.create_collection(schema, c_label, raise_if_exists=False)
        / s_label
    )
    assert oseries.frame() == expected

    # Test with existing series with existing data
    local_repo = Repo()
    local_coll = local_repo.create_collection(schema, c_label)
    lseries = local_coll / s_label
    frm = Frame(
        schema,
        {
            "timestamp": range(0, 20),
            "value": range(0, 20),
        },
    )
    lseries.write(frm)
    local_coll.pull(remote_coll)
    assert lseries.frame() == frm

    # Test with existing series with other schema
    local_repo = Repo()
    other_schema = Schema(timestamp="int*", value="int")
    local_coll = local_repo.create_collection(other_schema, c_label)
    lseries = local_coll / s_label
    with pytest.raises(ValueError):
        local_repo.pull(remote_repo)
def do_squash_and_gc(token):
    pod = POD.from_token(token)
    for i in range(10):
        repo = Repo(pod=pod)
        repo.gc()
        sleep(0.05)
def test_refresh():
    pod = MemPOD(".")
    repo = Repo(pod=pod)
    repo.create_collection(SCHEMA, "collection")
    assert repo.ls() == ["collection"]

    repo2 = Repo(pod=pod)
    repo2.delete("collection")
    # repo is out of sync
    assert repo.ls() == ["collection"]
    # refresh solves this
    repo.refresh()
    assert repo.ls() == []
def repo():
    return Repo("memory://")
SIZE = 100_000

values = random(SIZE)
timestamps = date_range("1970-01-01", freq="5min", periods=SIZE)
df = DataFrame({
    "ts": timestamps,
    "value": values,
})
df.to_csv(f"timeseries-{suffix}.csv")
df.to_parquet(f"timeseries-{suffix}.snappy.pqt", compression='snappy')
df.to_parquet(f"timeseries-{suffix}.brotli.pqt", compression='brotli')
with timeit('pqt'):
    df.to_parquet(f"timeseries-{suffix}.gzip.pqt", compression='gzip')

repo = Repo("repo")
schema = Schema(ts="timestamp*", value="float")
clct = repo / "my_collection"
if not clct:
    clct = repo.create_collection(schema, "my_collection")
series = clct / "my_series"
with timeit('lk'):
    series.write(df)

## Results
# $ python examples/data_size.py
# pqt 198.76ms
# lk 24.24ms
def write_lk(df):
    schema = Schema(timestamp="timestamp*", value="float")
    repo = Repo("test-db")
    collection = repo.create_collection(schema, "test")
    series = collection / "test"
    series.write(df)