Example no. 1
# Note: schema, years and the insert worker are defined elsewhere in the
# original test module; the imports below are the ones this snippet needs.
from dask.distributed import Client, LocalCluster

from lakota import Repo
from lakota.utils import timeit


def test_insert(pod):
    # Write with workers
    label = "my_label"
    repo = Repo(pod=pod)
    # Create collection and label
    collection = repo.create_collection(schema, "my_collection")
    token = pod.token
    cluster = LocalCluster(processes=False)
    client = Client(cluster)
    args = [(token, label, y) for y in years]
    with timeit(f"\nWRITE ({pod.protocol})"):
        fut = client.map(insert, args)
        assert sum(client.gather(fut)) == 10_519_200
    client.close()
    cluster.close()

    # Merge everything and read series
    with timeit(f"\nMERGE ({pod.protocol})"):
        collection.merge()

    with timeit(f"\nREAD ({pod.protocol})"):
        series = collection / label
        df = series["2015-01-01":"2015-01-02"].df()
        assert len(df) == 1440
        df = series["2015-12-31":"2016-01-02"].df()
        assert len(df) == 2880
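The insert function mapped across the Dask workers is not shown in this excerpt. Below is a minimal sketch of what such a worker could look like, assuming the pod can be rebuilt from its token with POD.from_token and that the series holds timestamp/value columns (both assumptions, not the original test code; the exact year range behind the 10_519_200 assertion is not reproduced here):

# Hypothetical worker for client.map above: rebuild the pod from its token,
# open the series and write one year of 5-minute points, then return the row count.
from numpy.random import random
from pandas import date_range

from lakota.pod import POD


def insert(args):
    token, label, year = args
    pod = POD.from_token(token)  # assumed way to reopen the pod inside a worker
    repo = Repo(pod=pod)
    collection = repo / "my_collection"
    series = collection / label
    ts = date_range(f"{year}-01-01", freq="5min", periods=365 * 288)
    series.write({"timestamp": ts, "value": random(len(ts))})
    return len(ts)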
Example no. 2
# Assumed imports for this snippet (only the lakota.utils line appears in the
# original excerpt).
from numpy.random import random
from pandas import DataFrame, date_range

from lakota import Repo, Schema
from lakota.utils import hextime, timeit

suffix = hextime()

SIZE = 100_000
values = random(SIZE)
timestamps = date_range("1970-01-01", freq="5min", periods=SIZE)
df = DataFrame({
    "ts": timestamps,
    "value": values,
})

df.to_csv(f"timeseries-{suffix}.csv")
df.to_parquet(f"timeseries-{suffix}.snappy.pqt", compression='snappy')
df.to_parquet(f"timeseries-{suffix}.brotli.pqt", compression='brotli')
with timeit('pqt'):
    df.to_parquet(f"timeseries-{suffix}.gzip.pqt", compression='gzip')

repo = Repo("repo")
schema = Schema(ts="timestamp*", value="float")
clct = repo / "my_collection"
if not clct:
    clct = repo.create_collection(schema, "my_collection")
series = clct / "my_series"

with timeit('lk'):
    series.write(df)

## Results

# $ python examples/data_size.py
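The results block above is truncated in this excerpt. A hypothetical follow-up could print the size of every file written above next to the on-disk Lakota repo, so the formats can be compared side by side (the disk_use helper is not part of the original script):

import os
from glob import glob


def disk_use(path):
    # Size of a single file, or the summed size of a directory tree.
    if os.path.isfile(path):
        return os.path.getsize(path)
    return sum(
        os.path.getsize(os.path.join(root, f))
        for root, _, files in os.walk(path)
        for f in files
    )


for path in sorted(glob(f"timeseries-{suffix}.*")) + ["repo"]:
    print(path, disk_use(path), "bytes")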
Example no. 3
# Assumed imports for this benchmark; SIZE (the total number of rows written
# per run) is defined elsewhere in the original script.
import subprocess

from numpy import arange, random
from pandas import DataFrame

from lakota import Repo, Schema
from lakota.utils import timeit

CHUNK_SIZES = (500, 5_000, 50_000, 500_000)


def create_df(start, stop):
    ts = arange(start, stop)
    value = arange(start, stop)
    random.shuffle(value)

    return DataFrame({"timestamp": ts, "value": value})


def call(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout


for chunk_size in CHUNK_SIZES:
    df = create_df(0, SIZE)
    with timeit(f"chunk size {chunk_size}:"):
        schema = Schema(timestamp="timestamp*", value="float")
        repo = Repo("test-db")
        collection = repo.create_collection(schema, "test")
        series = collection / "test"
        for i in range(0, SIZE, chunk_size):
            series.write(df[i : i + chunk_size])
    res = call("du -hs test-db")
    print("Disk use", res.split()[0].decode())
    call("rm -r test-db")
Example no. 4
schema = Schema(key="int*", **{x: "float" for x in cols})
frm = {
    "key": range(SIZE),
}
for x in cols:
    frm[x] = sin(arange(SIZE))


# Simulate network lag
def lag(fn, delay):
    def wrapper(*a, **kw):
        sleep(delay)
        return fn(*a, **kw)

    return wrapper


mempod_write = MemPOD.write

for delay in (0.001, 0.01, 0.1):
    MemPOD.write = lag(mempod_write, delay)
    for threaded in (False, True):
        settings.threaded = threaded
        with timeit(f"{delay}-{threaded}"):
            repo = Repo()
            clc = repo.create_collection(schema, "clc")
            with clc.multi():
                for name in "ABC":
                    series = clc / name
                    series.write(frm)
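Because the loop above monkeypatches MemPOD.write, a short cleanup restores the original method once the benchmark is done (settings.threaded is simply left at the last value set by the loop):

# Undo the monkeypatch so subsequent writes run at normal speed again.
MemPOD.write = mempod_write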
Example no. 5
# Assumed imports for this comparison; the write_pg and write_lk helpers timed
# below are not shown in this excerpt.
import pandas
import psycopg2

from numpy import arange, sin

from lakota import Repo
from lakota.utils import timeit


def read_lk():
    repo = Repo("test-db")
    collection = repo / "test"
    series = collection / "test"
    return series.frame()


def read_pg():
    conn = psycopg2.connect("postgresql:///test")
    cursor = conn.cursor()
    cursor.execute("select * from test")
    return list(cursor)


ts = pandas.date_range("1970-01-01", "2020-01-01", freq="5min")
value = sin(arange(len(ts)))
df = pandas.DataFrame({
    "timestamp": ts,
    "value": value,
})

with timeit("write pg"):
    write_pg(df)
with timeit("write lk"):
    write_lk(df)
with timeit("read pg"):
    read_pg()
with timeit("read lk"):
    read_lk()
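The write_pg and write_lk helpers timed above are not shown in this excerpt. A minimal sketch of what they could look like, assuming the same connection string, table name and column layout as the readers (the table DDL, the row-by-row insert and the Schema definition are all assumptions):

from lakota import Schema


def write_pg(df):
    # Hypothetical Postgres writer: recreate the table and insert the frame.
    conn = psycopg2.connect("postgresql:///test")
    cursor = conn.cursor()
    cursor.execute("drop table if exists test")
    cursor.execute("create table test (timestamp timestamptz, value float8)")
    rows = [
        (t.to_pydatetime(), float(v))
        for t, v in df.itertuples(index=False, name=None)
    ]
    cursor.executemany("insert into test values (%s, %s)", rows)
    conn.commit()


def write_lk(df):
    # Hypothetical Lakota writer mirroring read_lk above.
    repo = Repo("test-db")
    schema = Schema(timestamp="timestamp*", value="float")
    collection = repo.create_collection(schema, "test")
    series = collection / "test"
    series.write(df)

executemany is the simplest form; a faster, more representative Postgres writer would use COPY.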