def test_hourly_partitions():
    nw = BatchWriter(
        inner_writer=NullWriter,
        dataset="bucket/path",
        partitions=["year_{yyyy}/month_{mm}/day_{dd}/by_hour/hour={HH}"],
    )
    for i in range(1):
        nw.append({"@": [" "] * BLOB_SIZE})
    res = nw.finalize()
    assert "by_hour/hour=" in res
def do_writer_default():
    w = BatchWriter(inner_writer=DiskWriter, dataset="_temp")
    for i in range(int(1e5)):
        w.append({"Barney Stinson": "Lorenzo Von Matterhorn"})
        w.append({"Laszlo Cravensworth": "Jackie Daytona"})
    w.finalize()
    del w
def do_writer_compressed(algo):
    w = BatchWriter(inner_writer=DiskWriter, dataset="_temp", format=algo)
    for i in range(int(1e5)):
        w.append({"test": True})
        w.append({"test": False})
    w.finalize()
    del w
def do_writer_abs():
    w = BatchWriter(
        inner_writer=DiskWriter,
        dataset=os.getcwd() + "/_temp",
        date=datetime.date.today(),
    )
    for i in range(int(1e5)):
        w.append({"Barney Stinson": "Lorenzo Von Matterhorn"})
        w.append({"Laszlo Cravensworth": "Jackie Daytona"})
    w.finalize()
Example #5
def test_index():
    # clear out any data left by previous runs
    shutil.rmtree("_temp/data/tweets", ignore_errors=True)

    r = Reader(inner_reader=DiskReader,
               dataset="tests/data/tweets",
               raw_path=True)
    w = BatchWriter(inner_writer=DiskWriter,
                    dataset="_temp/data/tweets",
                    index_on=["username"])
    for item in r:
        w.append(item)
    w.finalize()
    index = glob.glob("_temp/data/tweets/**/*username.idx", recursive=True)
    assert len(index) == 1, index

    with open(index[0], "rb") as f:
        idx = f.read()

    # test the recently created index outside the reader
    i = Index(io.BytesIO(idx))
    assert i.search("SwiftOnSecurity") == []
    assert i.search("BBCNews") == [1, 2, 4, 24, 25, 44], i.search("BBCNews")

    # test the filter with an index
    ri = Reader(
        inner_reader=DiskReader,
        dataset="_temp/data/tweets",
        filters="username = '******'",
    )
    ri = list(ri)

    assert len(ri) == 6
def test_disk_text():

    try:

        w = BatchWriter(
            inner_writer=DiskWriter,
            blob_size=1024,
            format="jsonl",
            dataset=f"_temp/test/gcs/dataset/text",
        )
        for i in range(250):
            w.append({"index": i + 300})
        w.finalize()

        # read the files we've just written, we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=DiskReader,
            dataset=f"_temp/test/gcs/dataset/text",
        )
        l = list(r)

        assert len(l) == 250, len(l)
    except Exception as e:  # pragma: no cover
        raise e
Example #7
def do_writer():
    w = BatchWriter(inner_writer=FileWriter,
                    dataset="tests/data/framed",
                    date=datetime.date.today())
    for i in range(int(1e5)):
        w.append({"test": 2})
    w.finalize()
Example #8
def test_gcs_text():

    # set up
    set_up()

    w = BatchWriter(
        inner_writer=GoogleCloudStorageWriter,
        project="testing",
        blob_size=1024,
        format="jsonl",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
    )
    for i in range(250):
        w.append({"index": i + 300})
    w.finalize()

    # read the files we've just written, we should be able to
    # read over both partitions.
    r = Reader(
        inner_reader=GoogleCloudStorageReader,
        project="testing",
        dataset=f"{BUCKET_NAME}/test/gcs/dataset/text",
        persistence=STORAGE_CLASS.MEMORY,
    )

    assert r.count() == 250, r
Example #9
def test_gcs_parquet():

    try:
        # set up the stub
        set_up()

        w = BatchWriter(
            inner_writer=GoogleCloudStorageWriter,
            project="testing",
            format="parquet",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        for i in range(100):
            w.append({"$$": i * 300})
        w.finalize()

        # read the files we've just written, we should be able to
        # read over both partitions.
        r = Reader(
            inner_reader=GoogleCloudStorageReader,
            project="testing",
            dataset=f"{BUCKET_NAME}/test/gcs/dataset",
        )
        l = list(r)
        assert isinstance(l[0], dict)
        assert len(l) == 100, len(l)
    except Exception as e:  # pragma: no cover
        raise e
Example #10
def execute_test(compress, schema, reader):

    # reader = read_jsonl('tweets.jsonl')

    res = []
    for i in range(10):
        writer = BatchWriter(
            inner_writer=NullWriter,
            dataset="_tests/{datefolders}",
            format=compress,
            schema=schema,
            metadata={"test_data": True},
        )
        start = time.perf_counter_ns()
        for record in reader:
            writer.append(record)
        writer.finalize()
        res.append((time.perf_counter_ns() - start) / 1e9)
    return statistics.mean(res)
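A hypothetical driver for the benchmark above (the format names, the materialised record list, and `schema=None` are assumptions for illustration, not part of the original example) could call `execute_test` once per output format and report the mean wall-clock time:

# hypothetical benchmark driver; format names and the reader source are assumptions
records = list(read_jsonl("tweets.jsonl"))   # materialise so every run sees the same records
for fmt in ("jsonl", "zstd", "parquet"):
    seconds = execute_test(fmt, schema=None, reader=records)
    print(f"{fmt}: {seconds:.3f} seconds per run (mean of 10)")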
Example #11
def test_using_batch_writer():

    errored = False
    try:
        _create_bucket()
        w = BatchWriter(
            inner_writer=MinIoWriter,
            end_point=os.getenv("MINIO_END_POINT"),
            access_key=os.getenv("MINIO_ACCESS_KEY"),
            secret_key=os.getenv("MINIO_SECRET_KEY"),
            secure=False,
            dataset=f"{BUCKET_NAME}/test_writer",
        )

        for member in VAMPIRIC_COUNCIL:
            w.append(member)
        w.finalize()
    except Exception as error:  # pragma: no cover
        print(error)
        errored = True

    assert not errored
Example #12
# function header reconstructed from the call site below; the parameter defaults are assumptions.
# read a file in fixed-size chunks and yield complete, delimiter-separated lines
def read_jsonl(filename, chunk_size=16 * 1024 * 1024, delimiter="\n"):
    with open(filename, "r", encoding="utf8") as f:
        carry_forward = ""
        chunk = "INITIALIZED"  # sentinel so the loop body runs at least once
        while len(chunk) > 0:
            chunk = f.read(chunk_size)
            augmented_chunk = carry_forward + chunk
            lines = augmented_chunk.split(delimiter)
            carry_forward = lines.pop()
            yield from lines
        if carry_forward:
            yield carry_forward


schema = Schema(schema_definition)
lines = read_jsonl("tests/data/index/not/tweets.jsonl")

writer = BatchWriter(
    inner_writer=DiskWriter,
    dataset="_temp/idx",
    # schema=schema,
    indexes=["user_name"],
)

for record in lines:
    writer.append(record)
writer.finalize()

reader = Reader(inner_reader=DiskReader,
                dataset="_temp/idx",
                filters=("user_name", "==", "Remy"))
i = 0
for i, r in enumerate(reader):
    print(i, r)

print(i)
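
The carry-forward logic in `read_jsonl` above is what keeps a record intact when it straddles a chunk boundary. A minimal self-contained sketch of the same pattern (the helper name `split_chunks` and the tiny chunk size are illustrative, not part of the original script):

import io

def split_chunks(buffer, chunk_size=8, delimiter="\n"):
    # same carry-forward pattern as read_jsonl, but over any file-like object
    carry_forward = ""
    chunk = buffer.read(chunk_size)
    while len(chunk) > 0:
        augmented_chunk = carry_forward + chunk
        lines = augmented_chunk.split(delimiter)
        carry_forward = lines.pop()  # the last piece may be a partial record
        yield from lines
        chunk = buffer.read(chunk_size)
    if carry_forward:
        yield carry_forward

# a record split across the 8-byte chunk boundary is still yielded whole
assert list(split_chunks(io.StringIO('{"a":1}\n{"b":22}\n'))) == ['{"a":1}', '{"b":22}']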