Example #1
def test_append_dataset():
    dt = {"first": Tensor(shape=(250, 300)), "second": "float"}
    url = "./data/test/model"
    ds = Dataset(schema=dt, shape=(100, ), url=url, mode="w")
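    # append_shape(20) grows the dataset from its initial 100 samples to 120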
    ds.append_shape(20)
    ds["first"][0] = np.ones((250, 300))

    assert len(ds) == 120
    assert ds["first"].shape[0] == 120
    assert ds["first", 5:10].shape[0] == 5
    assert ds["second"].shape[0] == 120
    ds.commit()

    ds = Dataset(url)
    assert ds["first"].shape[0] == 120
    assert ds["first", 5:10].shape[0] == 5
    assert ds["second"].shape[0] == 120
Example #2
def test_dataset_3():
    dt = {
        "first": Tensor(shape=(2, )),
        "second": "float",
        "text": Text(shape=(None, ), max_shape=(12, )),
    }
    ds = Dataset("./data/test/ds_3/", schema=dt, shape=(9, ), mode="w")
    with pytest.raises(ValueError):
        ds[3, 8] = np.ones((3, 5))

    with pytest.raises(KeyError):
        ds["abc"] = np.ones((3, 5))
    ds["second"] = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
    for i in range(9):
        assert ds[i, "second"].compute() == i
    with pytest.raises(ValueError):
        ds[3, 8].compute()
Example #3
def get_dataset_from_hub(samples=1, read_from_fs=False, pytorch=False):
    """
    Build dataset and transform to pytorch or tensorflow
    """
    my_schema = {"img": Tensor(shape=(3, 256, 256)), "label": "uint8"}
    if not read_from_fs:
        ds = hub.Dataset("test/benchmarking", shape=(samples,), schema=my_schema)
    else:
        ds = hub.Dataset(
            "s3://snark-test/benchmarking_test", shape=(samples,), schema=my_schema
        )
    for i in range(samples):
        ds["img", i] = np.random.rand(3, 256, 256)
        ds["label", i] = 0

    ds = ds.to_pytorch() if pytorch else ds.to_tensorflow()
    return ds
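
A minimal, hedged consumption sketch for the helper above. It assumes the object returned by to_pytorch() can be wrapped in torch.utils.data.DataLoader, that each sample is a dict keyed by the schema fields ("img", "label"), and that the "test/benchmarking" path used above is accessible (swap in a local path otherwise); the sample count and batch size are illustrative.

from torch.utils.data import DataLoader

pt_ds = get_dataset_from_hub(samples=4, pytorch=True)
loader = DataLoader(pt_ds, batch_size=2)  # assumes to_pytorch() output is DataLoader-compatible
for batch in loader:
    print(batch["img"].shape, batch["label"])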
Example #4
def test_minio_endpoint():
    token = {
        "aws_access_key_id": os.getenv("ACTIVELOOP_MINIO_KEY"),
        "aws_secret_access_key": os.getenv("ACTIVELOOP_MINIO_SECRET_ACCESS_KEY"),
        "endpoint_url": "https://play.min.io:9000",
        "region": "us-east-1",
    }

    schema = {"abc": Tensor((100, 100, 3))}
    ds = Dataset(
        "s3://bucket/random_dataset", token=token, shape=(10,), schema=schema, mode="w"
    )

    for i in range(10):
        ds["abc", i] = i * np.ones((100, 100, 3))
    ds.flush()
    for i in range(10):
        assert (ds["abc", i].compute() == i * np.ones((100, 100, 3))).all()
Example #5
def test_tensorview_dynamicshapes(url="./data/test/dataset", token=None):
    my_schema = {
        "image": {
            "a": {
                "c": Tensor(
                    shape=(None, None, None),
                    dtype="uint8",
                    max_shape=(10000, 10000, 10000),
                ),
            }
        }
    }
    ds = Dataset(url, token=token, shape=(100, ), mode="w", schema=my_schema)
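    # ds[0:1] and ds[2:3] below are dataset views; index 0 inside each view maps
    # to absolute samples 0 and 2, so the print at the end shows 2 * np.ones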
    ds[0:1]["image", "a", "c", 0] = np.ones((7, 10, 20))
    ds[2:3]["image", "a", "c", 0] = 2 * np.ones((7, 10, 20))

    print(ds["image", "a", "c", 2].compute())
Example #6
def test_tensorview_shapes_1(url="./data/test/dataset", token=None):
    my_schema = {
        "image": Tensor(
            (None, None, None, None), "uint8", max_shape=(10, 1920, 1080, 4)
        ),
        "label": float,
    }
    ds = Dataset(url, token=token, shape=(100, ), mode="w", schema=my_schema)
    ds["image", 1] = np.ones((8, 345, 75, 2))
    ds["image", 2] = np.ones((5, 345, 90, 3))
    assert ds["image", 1:3, 2:4, 300:330].shape.tolist() == [
        [2, 30, 75, 2],
        [2, 30, 90, 3],
    ]
    assert ds["image", 0].shape.tolist() == [0, 0, 0, 0]
    assert ds["label", 5:50].shape.tolist() == [45]
Example #7
def test_datasetview_filter():
    def abc_filter(sample):
        return sample["ab"].compute().startswith("abc")

    my_schema = {
        "img": Tensor((100, 100)),
        "ab": Text((None, ), max_shape=(10, ))
    }
    ds = Dataset("./data/new_filter_2", shape=(10, ), schema=my_schema)
    for i in range(10):
        ds["img", i] = i * np.ones((100, 100))
        ds["ab", i] = "abc" + str(i) if i % 2 == 0 else "def" + str(i)
    dsv = ds[2:7]
    ds2 = dsv.filter(abc_filter)
    assert ds2.indexes == [2, 4, 6]
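    # filtering a single-sample view yields a scalar index rather than a list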
    dsv2 = ds[2]
    ds3 = dsv2.filter(abc_filter)
    assert ds3.indexes == 2
Example #8
def test_dataset_lazy():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/ds_lazy"
    ds = Dataset(schema=dt, shape=(2,), url=url, mode="w")
    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])
    ds.disable_lazy()
    assert ds["text", 1] == "hello world"
    assert ds["second", 0] == 3.14
    assert (ds["first", 0] == np.array([5, 6])).all()
    ds.enable_lazy()
    assert ds["text", 1].compute() == "hello world"
    assert ds["second", 0].compute() == 3.14
    assert (ds["first", 0].compute() == np.array([5, 6])).all()
Example #9
def test_dataset_compute():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/ds_compute"
    ds = Dataset(schema=dt, shape=(2,), url=url, mode="w")
    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])
    comp = ds.compute()
    comp0 = comp[0]
    assert (comp0["first"] == np.array([5, 6])).all()
    assert comp0["second"] == 3.14
    assert comp0["text"] == ""
    comp1 = comp[1]
    assert (comp1["first"] == np.array([0, 0])).all()
    assert comp1["second"] == 0
    assert comp1["text"] == "hello world"
Example #10
def test_tensor_flattening():
    t = {
        "image": Image(shape=(300, 400, 3), dtype="uint8"),
        "label": Tensor(
            shape=(5000,),
            dtype="<U20",
        ),
        "gradient": {
            "x": "int32",
            "y": "int32",
        },
    }
    result = tuple(flatten(t))
    paths = [r[1] for r in result]
    dtypes = [r[0] for r in result]

    assert paths == ["/image", "/label", "/gradient/x", "/gradient/y"]
    assert isinstance(dtypes[0], Image)
    assert isinstance(dtypes[1], Tensor)
    assert isinstance(dtypes[2], Primitive)
    assert isinstance(dtypes[3], Primitive)
Example #11
def test_pickleability(url="./data/test/test_dataset_dynamic_shaped"):
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100,),
        )
    }
    ds = Dataset(
        url=url,
        token=None,
        shape=(1000,),
        mode="w",
        schema=schema,
    )

    ds["first"][0] = np.ones((10, 10))

    pickled_ds = cloudpickle.dumps(ds)
    new_ds = pickle.loads(pickled_ds)
    assert np.all(new_ds["first"][0].compute() == ds["first"][0].compute())
Example #12
def test_dataset_store():
    my_schema = {"image": Tensor((100, 100), "uint8"), "abc": "uint8"}

    ds = Dataset("./test/ds_store", schema=my_schema, shape=(100,))
    for i in range(100):
        ds["image", i] = i * np.ones((100, 100))
        ds["abc", i] = i

    def my_filter(sample):
        return sample["abc"].compute() % 5 == 0

    dsv = ds.filter(my_filter)

    ds2 = ds.store("./test/ds2_store")
    for i in range(100):
        assert (ds2["image", i].compute() == i * np.ones((100, 100))).all()
        assert ds["abc", i].compute() == i

    ds3 = dsv.store("./test/ds3_store")
    for i in range(20):
        assert (ds3["image", i].compute() == 5 * i * np.ones((100, 100))).all()
        assert ds3["abc", i].compute() == 5 * i
Example #13
def test_dataset_casting():
    my_schema = {
        "a": Tensor(shape=(1, ), dtype="float64"),
    }

    @transform(schema=my_schema)
    def my_transform(annotation):
        return {
            "a": 2.4,
        }

    out_ds = my_transform(range(100))
    res_ds = out_ds.store("./data/casting")
    assert res_ds["a", 30].compute() == np.array([2.4])

    ds = Dataset(schema=my_schema, url="./data/casting2", shape=(100, ))
    for i in range(100):
        ds["a", i] = 0.2
    assert ds["a", 30].compute() == np.array([0.2])

    ds2 = Dataset(schema=my_schema, url="./data/casting3", shape=(100, ))
    ds2["a", 0:100] = np.ones(100, )
    assert ds2["a", 30].compute() == np.array([1])
Example #14
def test_dataset_dynamic_shaped():
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100,),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(1000,),
        mode="w",
        schema=schema,
    )

    ds["first", 50, 50:60, 50:60] = np.ones((10, 10), "int32")
    assert (ds["first", 50, 50:60, 50:60].numpy() == np.ones((10, 10), "int32")).all()

    ds["first", 0, :10, :10] = np.ones((10, 10), "int32")
    ds["first", 0, 10:20, 10:20] = 5 * np.ones((10, 10), "int32")
    assert (ds["first", 0, 0:10, 0:10].numpy() == np.ones((10, 10), "int32")).all()
Example #15
def test_dataset_dynamic_shaped_slicing():
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100,),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(100,),
        mode="w",
        schema=schema,
    )

    for i in range(100):
        ds["first", i] = i * np.ones((i, i))
    items = ds["first", 0:100].compute()
    for i in range(100):
        assert (items[i] == i * np.ones((i, i))).all()

    assert (ds["first", 1:2].compute()[0] == np.ones((1, 1))).all()
Example #16
"""
License:
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""

import numpy as np

import hub
from hub.schema import Tensor

schema = {
    "image": Tensor((10, 1920, 1080, 3), "uint8"),
    "label": {
        "a": Tensor((100, 200), "int32"),
        "b": Tensor((100, 400), "int64"),
    },
}


def test_hub_open():
    ds = hub.Dataset("./data/test/hub_open",
                     token=None,
                     shape=(10000, ),
                     mode="w",
                     schema=schema)
    ds["label/a", 5, 50, 50] = 9
    assert ds["label/a", 5, 50, 50].numpy() == 9
    ds["image", 5, 4, 120:200, 150:300, :] = 3 * np.ones((80, 150, 3), "uint8")
    assert (ds["image", 5, 4, 120:200, 150:300, :].numpy() == 3 * np.ones(
        (80, 150, 3), "uint8")).all()
Example #17
import hub.api.dataset as dataset  # assumed hub 1.x layout; needed for `Dataset = dataset.Dataset` below
from hub.exceptions import DirectoryNotEmptyException
from hub.schema import BBox, ClassLabel, Image, SchemaDict, Sequence, Tensor, Text
from hub.schema.class_label import ClassLabel
from hub.utils import (
    azure_creds_exist,
    gcp_creds_exist,
    hub_creds_exist,
    minio_creds_exist,
    s3_creds_exist,
    transformers_loaded,
)

Dataset = dataset.Dataset

my_schema = {
    "image": Tensor((10, 1920, 1080, 3), "uint8"),
    "label": {
        "a": Tensor((100, 200), "int32", compressor="lz4"),
        "b": Tensor((100, 400), "int64", compressor="zstd"),
        "c": Tensor((5, 3), "uint8"),
        "d": {"e": Tensor((5, 3), "uint8")},
    },
}


def test_dataset_2():
    dt = {"first": "float", "second": "float"}
    ds = Dataset(schema=dt, shape=(2,), url="./data/test/test_dataset2", mode="w")
    ds.meta_information["description"] = "This is my description"

    ds["first"][0] = 2.3
Example #18
def test_tensor_error():
    with pytest.raises(TypeError, match="shape cannot be None"):
        Tensor(None, max_shape=None)
Example #19
def test_dataset_bug_2(url="./data/test/dataset", token=None):
    my_schema = {
        "image": Tensor((100, 100), "uint8"),
    }
    ds = Dataset(url, token=token, shape=(10000,), mode="w", schema=my_schema)
    ds["image", 0:1] = [np.zeros((100, 100))]
Example #20
def test_tensor_error():
    with pytest.raises(
        TypeError, match="both shape and max_shape cannot be None at the same time"
    ):
        Tensor(None, max_shape=None)
Example #21
def test_tensor_repr():
    tensor_object = Tensor()
    tensor_object_2 = Tensor(shape=(5000,), dtype="<U20")
    assert tensor_object.__repr__() == "Tensor(shape=(None,), dtype='float64')"
    assert tensor_object_2.__repr__() == "Tensor(shape=(5000,), dtype='<U20')"
Example #22
def test_tensor_init():
    with pytest.raises(ValueError):
        Tensor(shape=2, max_shape=(2, 2))
Example #23
from hub import Dataset
from hub.api.datasetview import TensorView
from hub.exceptions import NoneValueException
from hub.schema import Tensor

import numpy as np
import pytest

my_schema = {
    "image": Tensor((None, None, None, None), "uint8", max_shape=(10, 1920, 1080, 4)),
    "label": float,
}

ds = Dataset("./data/test/dataset", shape=(100, ), mode="w", schema=my_schema)


def test_tensorview_init():
    with pytest.raises(NoneValueException):
        tensorview_object = TensorView(ds, subpath=None)
    with pytest.raises(NoneValueException):
        tensorview_object_2 = TensorView(dataset=None, subpath="image")


def test_tensorview_getitem():
    images_tensorview = ds["image"]
    with pytest.raises(IndexError):
        images_tensorview["7", 0:1920, 0:1080, 0:3].compute()

Example #24
def test_dataset_change_schema():
    schema = {
        "abc": "uint8",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    ds = Dataset("./data/test_schema_change", schema=schema, shape=(100, ))
    new_schema_1 = {
        "abc": "uint8",
        "def": {
            "ghi": Tensor((200, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_2 = {
        "abrs": "uint8",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_3 = {
        "abc": "uint8",
        "def": {
            "ghijk": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_4 = {
        "abc": "uint16",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_5 = {
        "abc": "uint8",
        "def": {
            "ghi": Tensor((100, 100, 3)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_1, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_2, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_3, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_4, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_5, shape=(100,))
Example #25
import numpy as np
import zarr

import hub
from hub.schema import Tensor, Image, Text
from hub.utils import Timer

my_schema = {
    "image": Tensor((28, 28, 4), "int32", (28, 28, 4)),
    "label": Text((None,), "int64", (20,)),
    "confidence": "float",
}

dynamic_schema = {
    "image": Tensor(shape=(None, None, None), dtype="int32", max_shape=(32, 32, 3)),
    "label": Text((None,), "int64", (20,)),
}


def test_pipeline_basic():
    ds = hub.Dataset("./data/test/test_pipeline_basic",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)

    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2
Example #26
def benchmark(sample_size=100, width=1000, channels=4, dtype="int8"):
    numpy_arr = np.zeros((sample_size, width, width, channels), dtype=dtype)
    zarr_fs = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.FSStore("./data/test/array"),
        overwrite=True,
    )
    zarr_lmdb = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.LMDBStore("./data/test/array2"),
        overwrite=True,
    )

    my_schema = {
        "image": Tensor((width, width, channels), dtype, (width, width, channels)),
    }

    ds_fs = hub.Dataset(
        "./data/test/test_pipeline_basic_3",
        mode="w",
        shape=(sample_size, ),
        schema=my_schema,
        cache=0,
    )

    ds_fs_cache = hub.Dataset(
        "./data/test/test_pipeline_basic_2",
        mode="w",
        shape=(sample_size, ),
        schema=my_schema,
    )
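    # Sequential-write comparison below is disabled (guarded by "if False"); flip it to True to run it.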
    if False:
        print(
            f"~~~ Sequential write of {sample_size}x{width}x{width}x{channels} random arrays ~~~"
        )
        for name, arr in [
            ("Numpy", numpy_arr),
            ("Zarr FS", zarr_fs),
            ("Zarr LMDB", zarr_lmdb),
            ("Hub FS", ds_fs["image"]),
            ("Hub FS+Cache", ds_fs_cache["image"]),
        ]:
            with Timer(name):
                for i in range(sample_size):
                    arr[i] = (np.random.rand(width, width, channels) *
                              255).astype(dtype)

    print(
        f"~~~ Pipeline {sample_size}x{width}x{width}x{channels} random arrays ~~~"
    )
    for name, processes in [
        ("single", 1),
        ("processed", 10),
    ]:  # , ("ray", 10), ("green", 10), ("dask", 10)]:

        @hub.transform(schema=my_schema, scheduler=name, processes=processes)
        def my_transform(sample):
            return {
                "image": (np.random.rand(width, width, channels) * 255).astype(dtype),
            }

        with Timer(name):
            out_ds = my_transform(ds_fs)
            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
Example #27
def test_dataset_no_shape(url="./data/test/dataset", token=None):
    with pytest.raises(ValueError):
        Tensor(shape=(120, 120, 3), max_shape=(120, 120, 4))
Example #28
def test_objectview():
    schema = SchemaDict(
        {
            "a": Tensor((20, 20), dtype=int, max_shape=(20, 20)),
            "b": Sequence(dtype=BBox(dtype=float)),
            "c": Sequence(
                dtype=SchemaDict(
                    {"d": Sequence((), dtype=Tensor((5, 5), dtype=float))}
                )
            ),
            "e": Sequence(
                dtype={"f": {"g": Tensor(5, dtype=int), "h": Tensor((), dtype=int)}}
            ),
        }
    )
    ds = hub.Dataset("./nested_seq", shape=(5, ), mode="w", schema=schema)

    # dataset view to objectview
    dv = ds[3:5]
    dv["c", 0] = {"d": 5 * np.ones((2, 2, 5, 5))}
    assert (dv[0, "c", 0, "d", 0].compute() == 5 * np.ones((5, 5))).all()

    # dataset view unsqueezed
    with pytest.raises(IndexError):
        dv["c", "d"].compute()

    # dataset unsqueezed
    with pytest.raises(IndexError):
        ds["c", "d"].compute()

    # tensorview to object view
    # sequence of tensor
    ds["b", 0] = 0.5 * np.ones((5, 4))
    tv = ds["b", 0]
    tv[0] = 0.3 * np.ones((4, ))
    assert (tv[0].compute() == 0.3 * np.ones((4, ))).all()

    # ds to object view
    assert (ds[3, "c", "d"].compute() == 5 * np.ones((2, 2, 5, 5))).all()

    # Sequence of schemadicts
    ds[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.array([42, 25, 15])}}
    with pytest.raises(KeyError):
        ds[0, "e", 1].compute()
    assert (ds[0, "e", "f", "h"].compute() == np.array([42, 25, 15])).all()

    # With dataset view
    dv[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.array([1, 25, 1])}}
    # dv[0, "e", 1]["f", "h"] = 25
    assert (dv[0, "e", "f", "h"].compute() == np.array([1, 25, 1])).all()

    # If not lazy mode all slices should be stable
    ds.lazy = False
    assert ds[0, "e", 0, "f", "h"] == 42
    with pytest.raises(KeyError):
        ds[0, "e", 1]["f", "h"] == 25
    ds.lazy = True

    # make an objectview
    ov = ds["c", "d"]
    with pytest.raises(IndexError):
        ov.compute()
    assert (ov[3].compute() == 5 * np.ones((2, 2, 5, 5))).all()
    # ov[3, 1] = 2 * np.ones((2, 5, 5))
    assert (ov[3][0, 0].compute() == 5 * np.ones((5, 5))).all()
    assert (ov[3][1].compute() == 5 * np.ones((2, 5, 5))).all()
Example #29
def test_tensor_repr():
    tensor_object_2 = Tensor(shape=(5000,), dtype="<U20")
    assert tensor_object_2.__repr__() == "Tensor(shape=(5000,), dtype='<U20')"
Example #30
def test_tensor_error_2():
    with pytest.raises(TypeError):
        t1 = Tensor(shape=(5.1))
    with pytest.raises(TypeError):
        t2 = Tensor(shape=(5.1, ))
    with pytest.raises(TypeError):
        t3 = Tensor(shape=(5, 6), max_shape=(7.2, 8))
    with pytest.raises(ValueError):
        t4 = Tensor(shape=(5, 6), max_shape=(7, 8, 9))
    with pytest.raises(TypeError):
        t5 = Tensor(shape=(5, None), max_shape=(5, None))
    with pytest.raises(TypeError):
        t6 = Tensor(shape=(5, 6), max_shape=(7.2, 8))
    with pytest.raises(ValueError):
        t7 = Tensor(max_shape=(10, 15))
    with pytest.raises(TypeError):
        t8 = Tensor(None)
    with pytest.raises(ValueError):
        t9 = Tensor((5, 6, None))
    with pytest.raises(TypeError):
        t10 = Tensor(max_shape="abc")
    with pytest.raises(TypeError):
        t11 = Tensor(max_shape=(7.4, 2))
    with pytest.raises(ValueError):
        t12 = Tensor(max_shape=[])