예제 #1
0
        def image_to_hub(tf_dt, max_shape=None):
            """Build a hub ``Image`` schema entry mirroring ``tf_dt``.

            Args:
                tf_dt: object exposing ``dtype.name`` and ``shape``
                    (presumably a TensorFlow feature spec -- confirm with caller).
                max_shape: optional upper bound per dimension. When it has more
                    entries than ``tf_dt.shape``, only the trailing entries are
                    kept. When it is falsy, a bound is derived from
                    ``tf_dt.shape`` with every ``None`` dimension replaced
                    by 10000.

            Returns:
                An ``Image`` with the same shape and dtype name as ``tf_dt``.
            """
            dtype_name = tf_dt.dtype.name
            rank = len(tf_dt.shape)
            if max_shape and len(max_shape) > rank:
                # Drop the leading surplus dimensions so ranks line up.
                surplus = len(max_shape) - rank
                max_shape = max_shape[surplus:]

            if not max_shape:
                max_shape = tuple(
                    dim if dim is not None else 10000 for dim in tf_dt.shape
                )
            return Image(shape=tf_dt.shape, dtype=dtype_name, max_shape=max_shape)
예제 #2
0
def test_dataset_filter_4():
    """Slicing a filtered view returns only samples matching the filter.

    The first ten samples get class 0 and the remaining ninety get class 1;
    after filtering on class 0, rows 3..7 of the view must all be 0.
    """
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset("./data/tests/filtering_4", shape=(100,), schema=schema, mode="w")
    for idx in range(100):
        label = 0 if idx < 10 else 1
        ds["cl", idx] = label
        ds["img", idx] = idx * np.ones((5, 6, 3))

    class_zero_view = ds.filter(lambda sample: sample["cl"].compute() == 0)
    sliced_labels = class_zero_view[3:8, "cl"].compute()
    assert (sliced_labels == np.zeros((5,))).all()
예제 #3
0
def test_dataset_batch_write():
    """Slice assignment works for a stacked array and for a list of arrays.

    First a single (4, 67, 65, 3) array is written to samples 0..3, then a
    list of two differently-shaped arrays is written to samples 5..6; each
    sample is read back and compared individually.
    """
    schema = {"image": Image(shape=(None, None, 3), max_shape=(100, 100, 3))}
    ds = Dataset("./data/batch", shape=(10,), mode="w", schema=schema)

    # Uniform batch: one stacked array covering four samples.
    ds["image", 0:4] = 4 * np.ones((4, 67, 65, 3))
    for idx in range(4):
        assert (ds["image", idx].numpy() == 4 * np.ones((67, 65, 3))).all()

    # Ragged batch: a list with one array per sample, each with its own shape.
    ds["image", 5:7] = [2 * np.ones((60, 65, 3)), 3 * np.ones((54, 30, 3))]
    ragged_expectations = (
        (5, 2, (60, 65, 3)),
        (6, 3, (54, 30, 3)),
    )
    for idx, fill, sample_shape in ragged_expectations:
        assert (ds["image", idx].numpy() == fill * np.ones(sample_shape)).all()
예제 #4
0
def test_dataset_filter_3():
    """Filter on a class label and check both the indexes and the payload.

    Every fifth sample is labelled 0, the others 1, and sample 4 is then
    relabelled 2. The class-0 view must list indexes 0, 5, ..., 95 and the
    class-2 view must contain exactly the image written at index 4.
    """
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset("./data/tests/filtering_3", shape=(100,), schema=schema, mode="w")
    for idx in range(100):
        ds["cl", idx] = 1 if idx % 5 != 0 else 0
        ds["img", idx] = idx * np.ones((5, 6, 3))
    ds["cl", 4] = 2

    zeros_view = ds.filter(lambda sample: sample["cl"].compute() == 0)
    assert zeros_view.indexes == list(range(0, 100, 5))

    twos_view = ds.filter(lambda sample: sample["cl"].compute() == 2)
    # The view holds a single sample, hence the leading dimension of 1.
    assert (twos_view["img"].compute() == 4 * np.ones((1, 5, 6, 3))).all()
    for sample in twos_view:
        assert (sample["img"].compute() == 4 * np.ones((5, 6, 3))).all()
        assert sample["cl"].compute() == 2
예제 #5
0
def test_commit_checkout_2():
    """Rewriting one sample after a commit must not leak across versions.

    Flow: fill 100 images, commit "first", double sample 21, commit "second",
    check out the first commit, branch to "another", triple sample 21 there,
    commit, then return to "master" and confirm it still holds the doubled
    value.
    """
    frame_shape = (1000, 1000, 3)
    my_schema = {
        "abc": "uint32",
        "img": Image(frame_shape, dtype="uint16"),
    }
    ds = hub.Dataset(
        "./data/test_versioning/eg_3",
        shape=(100, ),
        schema=my_schema,
        mode="w",
    )

    def holds(index, value):
        """Return True when every element of "img" sample ``index`` equals ``value``."""
        return (ds["img", index].compute() == value * np.ones(frame_shape)).all()

    for idx in range(100):
        ds["img", idx] = idx * np.ones(frame_shape)
    first_commit = ds.commit("first")

    # Original author's note: this rewrites chunk 7.0.0.0 (chunk layout is
    # not verifiable from this test alone).
    ds["img", 21] = 2 * ds["img", 21].compute()

    # The rewritten sample is doubled; its neighbours are untouched.
    assert holds(21, 2 * 21)
    assert holds(22, 22)
    assert holds(23, 23)

    # A sample far from the rewritten one is still readable unchanged
    # (original note: chunk 11, holding sample 35, keeps a single copy).
    assert holds(35, 35)

    ds.commit("second")  # returned commit id is not needed below

    # Back to the first commit: the pre-rewrite value must be visible again.
    ds.checkout(first_commit)
    assert holds(21, 21)

    ds.checkout("another", create=True)

    ds["img", 21] = 3 * ds["img", 21].compute()
    # 3 * 21, not 6 * 21: the branch started from the first commit, not "second".
    assert holds(21, 3 * 21)

    ds.commit("first2")

    ds.checkout("master")
    assert holds(21, 2 * 21)
    ds.log()
예제 #6
0
def test_multiprocessing(sample_size=200,
                         width=100,
                         channels=4,
                         dtype="uint8"):
    """Run a threaded ``hub.transform`` over a dataset and verify its output.

    Args:
        sample_size: number of samples in the dataset; also determines the
            ``chunks`` argument (``sample_size // 20``).
        width: height and width of each generated image.
        channels: number of channels per image.
        dtype: dtype used for the schema and the generated arrays.

    Every transformed sample is an all-255 image, so the stored "image"
    tensor must equal 255 everywhere.
    """
    my_schema = {
        "image":
        Image(
            (width, width, channels),
            dtype,
            (width, width, channels),
            chunks=(sample_size // 20),
            compressor="LZ4",
        ),
    }

    with Timer("multiprocesing"):

        @hub.transform(schema=my_schema, scheduler="threaded", workers=4)
        def my_transform(x):
            # `a` is never returned; presumably these multiplications only
            # exist to give each call some CPU work -- confirm with author.
            a = np.random.random((width, width, channels))
            for _ in range(100):
                a *= np.random.random((width, width, channels))

            return {
                "image": (np.ones(
                    (width, width, channels), dtype=dtype) * 255),
            }

        ds = hub.Dataset(
            "./data/test/test_pipeline_basic_4",
            mode="w",
            shape=(sample_size, ),
            schema=my_schema,
            # Was `2 * 26` (52 bytes), almost certainly a typo for 2**26
            # (64 MiB), the value used by the upload benchmark example.
            cache=2**26,
        )

        ds_t = my_transform(ds).store("./data/test/test_pipeline_basic_4")

    assert (ds_t["image", :].compute() == 255).all()
예제 #7
0
def test_tensor_flattening():
    """``flatten`` yields (dtype, path) pairs for a nested schema, in order.

    Nested dict keys become "/"-separated paths, and plain dtype strings
    such as "int32" come back as ``Primitive`` instances.
    """
    nested_schema = {
        "image": Image(shape=(300, 400, 3), dtype="uint8"),
        "label": Tensor(
            shape=(5000,),
            dtype="<U20",
        ),
        "gradient": {
            "x": "int32",
            "y": "int32",
        },
    }
    flattened = tuple(flatten(nested_schema))
    # Each entry carries the dtype object at position 0 and its path at 1.
    paths = [entry[1] for entry in flattened]
    dtypes = [entry[0] for entry in flattened]

    assert paths == ["/image", "/label", "/gradient/x", "/gradient/y"]
    expected_kinds = (Image, Tensor, Primitive, Primitive)
    for found, kind in zip(dtypes, expected_kinds):
        assert isinstance(found, kind)
예제 #8
0
def test_dynamic_version_control():
    """Dynamically-shaped samples keep per-version shapes and values.

    Ten 100x100 images are committed, then overwritten with 150x150 images
    on "master". Checking out the commit must show the old data; checking
    out "master" must show the new data.
    """
    my_schema = {"img": Image((None, None, 3), max_shape=(1000, 1000, 3))}
    ds = hub.Dataset(
        "./data/dynamic_versioning",
        shape=(10, ),
        schema=my_schema,
        mode="w",
    )
    sample_ids = range(10)

    def check_all(scale, side):
        """Assert sample i equals ``scale * i`` over a (side, side, 3) array."""
        for idx in sample_ids:
            wanted = scale * idx * np.ones((side, side, 3))
            assert (ds["img", idx].compute() == wanted).all()

    for idx in sample_ids:
        ds["img", idx] = idx * np.ones((100, 100, 3))
    first_commit = ds.commit("first")

    # Overwrite every sample with a larger, doubled image after the commit.
    for idx in sample_ids:
        ds["img", idx] = 2 * idx * np.ones((150, 150, 3))

    ds.checkout(first_commit)
    check_all(1, 100)

    ds.checkout("master")
    check_all(2, 150)
예제 #9
0
def main():
    """Write one 2048x2048 tile into a dataset of very large dynamic images.

    Creates a 10000-sample dataset whose "image" tensor allows up to
    100000x100000x4 pixels per sample, writes a single tile into sample 3,
    prints the shape recorded for that sample, and commits. The whole run
    is timed.
    """
    with Timer("Time"):
        image_spec = Image(
            (None, None, 4),
            dtype="uint8",
            chunks=(1, 2048, 2048, 4),
            max_shape=(100000, 100000, 4),
        )
        schema = {"image": image_spec}
        ds = hub.Dataset(
            "./data/examples/big_image",
            mode="w",
            schema=schema,
            shape=(10000, ),
        )

        print(ds["image"].shape, ds["image"].dtype)

        # The tile matches the chunk size: single chunk read/write.
        tile = np.ones((2048, 2048, 4), dtype="uint8")
        ds["image", 3, 0:2048, 0:2048] = tile
        print(ds._tensors["/image"].get_shape((3, )))
        ds.commit()
예제 #10
0
def main():
    """Walk through create -> write -> resize -> flush -> reopen on a Dataset.

    Writes ten 28x28 images with label 3, resizes the dataset to 200
    samples, prints a slice of labels from the resized region, flushes,
    then reopens the dataset from disk and prints its schema and first image.
    """
    path = "./data/examples/new_api_intro2"
    schema = {
        "image": Image(shape=(None, None), max_shape=(28, 28)),
        "label": ClassLabel(num_classes=10),
    }

    ds = Dataset(path, shape=(10, ), mode="w", schema=schema)
    sample_count = len(ds)
    print(sample_count)
    for idx in range(sample_count):
        with Timer("writing single element"):
            ds["image", idx] = np.ones((28, 28), dtype="uint8")
            ds["label", idx] = 3

    ds.resize_shape(200)
    print(ds.shape)
    # Indexes 100..109 only exist after the resize above.
    print(ds["label", 100:110].numpy())
    with Timer("Committing"):
        ds.flush()

    # Reopen from the same path to show what was persisted.
    ds = Dataset(path)
    print(ds.schema)
    print(ds["image", 0].compute())
예제 #11
0
def test_dataset_batch_write_2():
    """Slice-assign a list of 14 arrays whose heights differ (640 down to 627).

    The test passes as long as the assignment does not raise.
    """
    schema = {"image": Image(shape=(None, None, 3), max_shape=(640, 640, 3))}
    ds = Dataset("./data/batch", shape=(100,), mode="w", schema=schema)

    ragged_batch = [np.ones((640 - shrink, 640, 3)) for shrink in range(14)]
    ds["image", 0:14] = ragged_batch
예제 #12
0
def test_dataset_filter_2():
    """Exercise combined and chained filters on two text columns.

    All 100 samples start as "John"/"Doe"; a handful of indexes are then
    changed to fname "Active" and/or lname "loop". The test checks that
    - a single filter with ``and`` and two chained filters (in either
      order) select the same indexes,
    - iterating dataset views and tensor views yields the filtered values,
    - filtering on a key missing from the schema raises ``KeyError``, both
      on a dataset and on an already-filtered view.
    """
    both_match = {"fname": "Active", "lname": "loop"}

    def fname_is_active(sample):
        return sample["fname"].compute() == "Active"

    def lname_is_loop(sample):
        return sample["lname"].compute() == "loop"

    my_schema = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
    }
    ds = Dataset("./data/tests/filtering", shape=(100,), schema=my_schema, mode="w")
    for idx in range(100):
        ds["fname", idx] = "John"
        ds["lname", idx] = "Doe"

    for idx in (1, 3, 6, 15, 63, 96, 75):
        ds["fname", idx] = "Active"

    for idx in (15, 31, 25, 75, 3, 6):
        ds["lname", idx] = "loop"

    # --- one filter combining both conditions -------------------------------
    dsv_combined = ds.filter(
        lambda sample: fname_is_active(sample) and lname_is_loop(sample)
    )
    tsv_combined_fname = dsv_combined["fname"]
    tsv_combined_lname = dsv_combined["lname"]
    for sample in dsv_combined:
        assert sample.compute() == both_match
    for value in tsv_combined_fname:
        assert value.compute() == "Active"
    for value in tsv_combined_lname:
        assert value.compute() == "loop"

    # --- chained filters: fname first, then lname ---------------------------
    dsv_1 = ds.filter(fname_is_active)
    dsv_2 = dsv_1.filter(lname_is_loop)
    for sample in dsv_1:
        assert sample.compute()["fname"] == "Active"
    tsv_1 = dsv_1["fname"]
    tsv_2 = dsv_2["lname"]
    for value in tsv_1:
        assert value.compute() == "Active"
    for value in tsv_2:
        assert value.compute() == "loop"
    for sample in dsv_2:
        assert sample.compute() == both_match
    assert dsv_combined.indexes == [3, 6, 15, 75]
    assert dsv_1.indexes == [1, 3, 6, 15, 63, 75, 96]
    assert dsv_2.indexes == [3, 6, 15, 75]

    # --- chained filters: lname first, then fname ---------------------------
    dsv_3 = ds.filter(lname_is_loop)
    dsv_4 = dsv_3.filter(fname_is_active)
    for sample in dsv_3:
        assert sample.compute()["lname"] == "loop"
    for sample in dsv_4:
        assert sample.compute() == both_match
    assert dsv_3.indexes == [3, 6, 15, 25, 31, 75]
    assert dsv_4.indexes == [3, 6, 15, 75]

    # --- filtering on a key that is not in the schema -----------------------
    my_schema2 = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
        "image": Image((1920, 1080, 3)),
    }
    ds = Dataset("./data/tests/filtering2", shape=(100,), schema=my_schema2, mode="w")

    def missing_key_filter(sample):
        return (sample["random"].compute() == np.ones((1920, 1080, 3))).all()

    with pytest.raises(KeyError):
        ds.filter(missing_key_filter)

    for idx in (1, 3, 6, 15, 63, 96, 75):
        ds["fname", idx] = "Active"
    dsv = ds.filter(fname_is_active)
    with pytest.raises(KeyError):
        dsv.filter(missing_key_filter)
import numpy as np

import hub
from hub.schema import Image, ClassLabel
from hub.utils import Timer

# Module-level schema passed to hub.Dataset in main(): a fixed 28x28 "image"
# tensor created with chunks=(1000, 28, 28) and a 10-class "label".
# Presumably MNIST-shaped data, judging by the benchmark's dataset path.
schema = {
    "image": Image((28, 28), chunks=(1000, 28, 28)),
    "label": ClassLabel(num_classes=10),
}


def main():
    """Time writing 70000 random 28x28 uint8 images in batches of 10.

    One random batch is generated up front and the same array is written
    to every consecutive slice of the "image" tensor; the whole run is
    wrapped in a single ``Timer``.
    """
    sample_count = 70000
    step = 10
    with Timer("Time"):
        ds = hub.Dataset(
            "./data/examples/mnist_upload_speed_benchmark",
            mode="w",
            schema=schema,
            shape=(sample_count, ),
            cache=2**26,
        )

        # Values fall in [0, 100) before the cast to uint8.
        batch = (np.random.rand(step, 28, 28) * 100).astype("uint8")

        for start in range(0, sample_count, step):
            stop = start + step
            ds["image", start:stop] = batch
예제 #14
0
import hub
from hub.schema import Image
import numpy as np

# Versioning walkthrough: fill a dataset, commit, rewrite one sample, commit
# again, then check out the first commit.
my_schema = {
    "abc": "uint32",
    "img": Image((1000, 1000, 3), dtype="uint16"),
}
ds = hub.Dataset("./data/test_versioning/eg_3",
                 shape=(100, ),
                 schema=my_schema,
                 mode="w")
# Sample i of "img" is filled with the constant value i.
for i in range(100):
    ds["img", i] = i * np.ones((1000, 1000, 3))
# `a` holds whatever ds.commit returns; it is passed to ds.checkout below.
a = ds.commit("first")

# chunk 7.0.0.0 gets rewritten
# (author's note -- the chunk layout is not visible in this script)
ds["img", 21] = 2 * ds["img", 21].compute()

# the rest of the chunk stays intact: sample 21 is doubled, 22 and 23 are unchanged
assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
assert (ds["img", 22].compute() == 22 * np.ones((1000, 1000, 3))).all()
assert (ds["img", 23].compute() == 23 * np.ones((1000, 1000, 3))).all()

# other chunks are still read from the original chunk, e.g. chunk 11, which contains the 35th sample, has a single copy
assert (ds["img", 35].compute() == 35 * np.ones((1000, 1000, 3))).all()

# `b` is not used in the visible part of this script.
b = ds.commit("second")

# going back to first commit
ds.checkout(a)