def test_append_dataset():
    """Extend a 100-sample dataset by 20 and check sizes before and after reopening it."""
    schema = {"first": Tensor(shape=(250, 300)), "second": "float"}
    path = "./data/test/model"

    def check_field_sizes(dataset):
        # Assertions shared by the freshly written and the reopened dataset.
        assert dataset["first"].shape[0] == 120
        assert dataset["first", 5:10].shape[0] == 5
        assert dataset["second"].shape[0] == 120

    ds = Dataset(schema=schema, shape=(100,), url=path, mode="w")
    ds.append_shape(20)
    ds["first"][0] = np.ones((250, 300))

    assert len(ds) == 120
    check_field_sizes(ds)
    ds.commit()

    # Open the same location again, this time without schema/shape/mode arguments.
    ds = Dataset(path)
    check_field_sizes(ds)
def test_dataset_3():
    """Check invalid assignments raise and that a whole-field assignment reads back per sample."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    ds = Dataset("./data/test/ds_3/", schema=schema, shape=(9,), mode="w")

    # Two integer indices without a field name are rejected on write.
    with pytest.raises(ValueError):
        ds[3, 8] = np.ones((3, 5))

    # "abc" is not a key of the schema.
    with pytest.raises(KeyError):
        ds["abc"] = np.ones((3, 5))

    ds["second"] = np.array(list(range(9)))
    for index in range(9):
        assert ds[index, "second"].compute() == index

    # The same index pattern is rejected on read as well.
    with pytest.raises(ValueError):
        ds[3, 8].compute()
def get_dataset_from_hub(samples=1, read_from_fs=False, pytorch=False):
    """
    Build dataset and transform to pytorch or tensorflow

    The dataset is created at "test/benchmarking" unless ``read_from_fs`` is
    truthy, in which case the s3 path is used. Every sample gets a random
    (3, 256, 256) image and a label of 0. The result of ``to_pytorch()`` is
    returned when ``pytorch`` is truthy, otherwise that of ``to_tensorflow()``.
    """
    my_schema = {"img": Tensor(shape=(3, 256, 256)), "label": "uint8"}

    if read_from_fs:
        location = "s3://snark-test/benchmarking_test"
    else:
        location = "test/benchmarking"
    ds = hub.Dataset(location, shape=(samples,), schema=my_schema)

    for index in range(samples):
        ds["img", index] = np.random.rand(3, 256, 256)
        ds["label", index] = 0

    if pytorch:
        return ds.to_pytorch()
    return ds.to_tensorflow()
def test_minio_endpoint():
    """Write ten constant images through a custom s3 endpoint and read them back."""
    credentials = {
        "aws_access_key_id": os.getenv("ACTIVELOOP_MINIO_KEY"),
        "aws_secret_access_key": os.getenv("ACTIVELOOP_MINIO_SECRET_ACCESS_KEY"),
        "endpoint_url": "https://play.min.io:9000",
        "region": "us-east-1",
    }
    schema = {"abc": Tensor((100, 100, 3))}
    sample_count = 10

    ds = Dataset(
        "s3://bucket/random_dataset",
        token=credentials,
        shape=(sample_count,),
        schema=schema,
        mode="w",
    )

    for index in range(sample_count):
        ds["abc", index] = index * np.ones((100, 100, 3))
    ds.flush()

    for index in range(sample_count):
        stored = ds["abc", index].compute()
        assert (stored == index * np.ones((100, 100, 3))).all()
def test_tensorview_dynamicshapes(url="./data/test/dataset", token=None):
    """Assign into a nested dynamic-shape tensor through dataset slices and print one sample."""
    dynamic_tensor = Tensor(
        shape=(None, None, None),
        dtype="uint8",
        max_shape=(10000, 10000, 10000),
    )
    my_schema = {"image": {"a": {"c": dynamic_tensor}}}
    ds = Dataset(url, token=token, shape=(100,), mode="w", schema=my_schema)

    # Index 0 is relative to each one-element dataset slice.
    first_view = ds[0:1]
    first_view["image", "a", "c", 0] = np.ones((7, 10, 20))
    third_view = ds[2:3]
    third_view["image", "a", "c", 0] = 2 * np.ones((7, 10, 20))

    # The test only prints the value; it makes no assertion about it.
    print(ds["image", "a", "c", 2].compute())
def test_tensorview_shapes_1(url="./data/test/dataset", token=None):
    """Check the shapes reported by tensor views over dynamically shaped samples."""
    image_spec = Tensor(
        (None, None, None, None), "uint8", max_shape=(10, 1920, 1080, 4)
    )
    my_schema = {"image": image_spec, "label": float}
    ds = Dataset(url, token=token, shape=(100,), mode="w", schema=my_schema)

    ds["image", 1] = np.ones((8, 345, 75, 2))
    ds["image", 2] = np.ones((5, 345, 90, 3))

    # One shape row per selected sample.
    sliced_shapes = ds["image", 1:3, 2:4, 300:330].shape.tolist()
    assert sliced_shapes == [
        [2, 30, 75, 2],
        [2, 30, 90, 3],
    ]

    # Sample 0 was never written.
    assert ds["image", 0].shape.tolist() == [0, 0, 0, 0]
    assert ds["label", 5:50].shape.tolist() == [45]
def test_datasetview_filter():
    """Filter a sliced view and a single-sample view by the prefix of a text field."""

    def abc_filter(sample):
        text = sample["ab"].compute()
        return text.startswith("abc")

    my_schema = {
        "img": Tensor((100, 100)),
        "ab": Text((None,), max_shape=(10,)),
    }
    ds = Dataset("./data/new_filter_2", shape=(10,), schema=my_schema)

    for index in range(10):
        ds["img", index] = index * np.ones((100, 100))
        # Even indices get the "abc" prefix, odd ones "def".
        prefix = "abc" if index % 2 == 0 else "def"
        ds["ab", index] = prefix + str(index)

    range_view = ds[2:7]
    filtered_range = range_view.filter(abc_filter)
    assert filtered_range.indexes == [2, 4, 6]

    single_view = ds[2]
    filtered_single = single_view.filter(abc_filter)
    assert filtered_single.indexes == 2
def test_dataset_lazy():
    """Read the same values with lazy mode disabled and then enabled again."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    path = "./data/test/ds_lazy"
    ds = Dataset(schema=schema, shape=(2,), url=path, mode="w")

    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])

    # With lazy mode off, indexing is compared directly against the values.
    ds.disable_lazy()
    assert ds["text", 1] == "hello world"
    assert ds["second", 0] == 3.14
    assert (ds["first", 0] == np.array([5, 6])).all()

    # With lazy mode on, the values are obtained through compute().
    ds.enable_lazy()
    assert ds["text", 1].compute() == "hello world"
    assert ds["second", 0].compute() == 3.14
    assert (ds["first", 0].compute() == np.array([5, 6])).all()
def test_dataset_compute():
    """Call compute() on a whole dataset and check written and unwritten fields per sample."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    path = "./data/test/ds_compute"
    ds = Dataset(schema=schema, shape=(2,), url=path, mode="w")

    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])

    computed = ds.compute()

    # Sample 0: "first" and "second" were written, "text" was not.
    sample_zero = computed[0]
    assert (sample_zero["first"] == np.array([5, 6])).all()
    assert sample_zero["second"] == 3.14
    assert sample_zero["text"] == ""

    # Sample 1: only "text" was written.
    sample_one = computed[1]
    assert (sample_one["first"] == np.array([0, 0])).all()
    assert sample_one["second"] == 0
    assert sample_one["text"] == "hello world"
def test_tensor_flattening():
    """Flatten a nested schema and check the resulting paths and schema object types."""
    nested_schema = {
        "image": Image(shape=(300, 400, 3), dtype="uint8"),
        "label": Tensor(
            shape=(5000,),
            dtype="<U20",
        ),
        "gradient": {
            "x": "int32",
            "y": "int32",
        },
    }

    flattened = tuple(flatten(nested_schema))
    # Each flattened entry is indexed as (schema object, path).
    dtypes = [entry[0] for entry in flattened]
    paths = [entry[1] for entry in flattened]

    assert paths == ["/image", "/label", "/gradient/x", "/gradient/y"]

    expected_types = (Image, Tensor, Primitive, Primitive)
    for position, expected in enumerate(expected_types):
        assert isinstance(dtypes[position], expected)
def test_pickleability(url="./data/test/test_dataset_dynamic_shaped"):
    """Serialize a dataset with cloudpickle, load it with pickle and compare one sample."""
    first_spec = Tensor(
        shape=(None, None),
        dtype="int32",
        max_shape=(100, 100),
        chunks=(100,),
    )
    schema = {"first": first_spec}
    ds = Dataset(url=url, token=None, shape=(1000,), mode="w", schema=schema)
    ds["first"][0] = np.ones((10, 10))

    payload = cloudpickle.dumps(ds)
    restored = pickle.loads(payload)

    restored_sample = restored["first"][0].compute()
    original_sample = ds["first"][0].compute()
    assert np.all(restored_sample == original_sample)
def test_dataset_store():
    """Store a dataset and a filtered view to new locations and verify both copies.

    The full copy must hold all 100 samples. The filtered copy keeps the
    samples whose "abc" value is a multiple of 5, so it is checked over 20
    samples whose values are ``5 * i``.
    """
    my_schema = {"image": Tensor((100, 100), "uint8"), "abc": "uint8"}
    ds = Dataset("./test/ds_store", schema=my_schema, shape=(100,))
    for i in range(100):
        ds["image", i] = i * np.ones((100, 100))
        ds["abc", i] = i

    def my_filter(sample):
        return sample["abc"].compute() % 5 == 0

    dsv = ds.filter(my_filter)

    ds2 = ds.store("./test/ds2_store")
    for i in range(100):
        assert (ds2["image", i].compute() == i * np.ones((100, 100))).all()
        # Check the stored copy, not the source dataset, so that "abc" is
        # actually verified to have been carried over by store().
        assert ds2["abc", i].compute() == i

    ds3 = dsv.store("./test/ds3_store")
    for i in range(20):
        assert (ds3["image", i].compute() == 5 * i * np.ones((100, 100))).all()
        assert ds3["abc", i].compute() == 5 * i
def test_dataset_casting():
    """Assign scalars and a flat array to a shape-(1,) float64 tensor and read them back."""
    my_schema = {
        "a": Tensor(shape=(1,), dtype="float64"),
    }

    @transform(schema=my_schema)
    def my_transform(annotation):
        # The input is ignored; every output sample gets the same scalar.
        return {"a": 2.4}

    # Scalar returned from a transform.
    transformed = my_transform(range(100))
    stored = transformed.store("./data/casting")
    assert stored["a", 30].compute() == np.array([2.4])

    # Scalar assigned sample by sample.
    per_sample = Dataset(schema=my_schema, url="./data/casting2", shape=(100,))
    for index in range(100):
        per_sample["a", index] = 0.2
    assert per_sample["a", 30].compute() == np.array([0.2])

    # Flat array of 100 values assigned to a slice of 100 samples.
    sliced = Dataset(schema=my_schema, url="./data/casting3", shape=(100,))
    sliced["a", 0:100] = np.ones(100)
    assert sliced["a", 30].compute() == np.array([1])
def test_dataset_dynamic_shaped():
    """Write sub-regions of dynamically shaped samples and read them back."""
    first_spec = Tensor(
        shape=(None, None),
        dtype="int32",
        max_shape=(100, 100),
        chunks=(100,),
    )
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(1000,),
        mode="w",
        schema={"first": first_spec},
    )

    # A region away from the origin of sample 50.
    ds["first", 50, 50:60, 50:60] = np.ones((10, 10), "int32")
    region = ds["first", 50, 50:60, 50:60].numpy()
    assert (region == np.ones((10, 10), "int32")).all()

    # Two separate regions of sample 0; the first is read back after the second write.
    ds["first", 0, :10, :10] = np.ones((10, 10), "int32")
    ds["first", 0, 10:20, 10:20] = 5 * np.ones((10, 10), "int32")
    corner = ds["first", 0, 0:10, 0:10].numpy()
    assert (corner == np.ones((10, 10), "int32")).all()
def test_dataset_dynamic_shaped_slicing():
    """Store samples of growing size and read them back through slices."""
    first_spec = Tensor(
        shape=(None, None),
        dtype="int32",
        max_shape=(100, 100),
        chunks=(100,),
    )
    sample_count = 100
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(sample_count,),
        mode="w",
        schema={"first": first_spec},
    )

    # Sample i is an (i, i) array filled with i, so sample 0 is empty.
    for index in range(sample_count):
        ds["first", index] = index * np.ones((index, index))

    computed = ds["first", 0:100].compute()
    for index in range(sample_count):
        assert (computed[index] == index * np.ones((index, index))).all()

    single = ds["first", 1:2].compute()
    assert (single[0] == np.ones((1, 1))).all()
"""
License:
This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
"""

import numpy as np

import hub
from hub.schema import Tensor

# Schema used by test_hub_open: one image tensor plus a nested "label" group.
schema = {
    "image": Tensor((10, 1920, 1080, 3), "uint8"),
    "label": {
        "a": Tensor((100, 200), "int32"),
        "b": Tensor((100, 400), "int64"),
    },
}


def test_hub_open():
    """Create a dataset through ``hub.Dataset`` and round-trip a scalar and an image region."""
    ds = hub.Dataset(
        "./data/test/hub_open",
        token=None,
        shape=(10000,),
        mode="w",
        schema=schema,
    )

    # The nested field is addressed with a slash-separated path.
    ds["label/a", 5, 50, 50] = 9
    assert ds["label/a", 5, 50, 50].numpy() == 9

    ds["image", 5, 4, 120:200, 150:300, :] = 3 * np.ones((80, 150, 3), "uint8")
    region = ds["image", 5, 4, 120:200, 150:300, :].numpy()
    assert (region == 3 * np.ones((80, 150, 3), "uint8")).all()
from hub.exceptions import DirectoryNotEmptyException
from hub.schema import BBox, ClassLabel, Image, SchemaDict, Sequence, Tensor, Text
from hub.schema.class_label import ClassLabel
from hub.utils import (
    azure_creds_exist,
    gcp_creds_exist,
    hub_creds_exist,
    minio_creds_exist,
    s3_creds_exist,
    transformers_loaded,
)

# NOTE(review): `dataset` is not imported in the visible part of the file;
# presumably it is brought into scope above this point.
Dataset = dataset.Dataset

# Module-level schema: an image tensor and a nested "label" group, two of
# whose tensors specify a compressor.
my_schema = {
    "image": Tensor((10, 1920, 1080, 3), "uint8"),
    "label": {
        "a": Tensor((100, 200), "int32", compressor="lz4"),
        "b": Tensor((100, 400), "int64", compressor="zstd"),
        "c": Tensor((5, 3), "uint8"),
        "d": {"e": Tensor((5, 3), "uint8")},
    },
}


def test_dataset_2():
    """Create a two-field float dataset, set a description and write one value."""
    schema = {"first": "float", "second": "float"}
    ds = Dataset(
        schema=schema,
        shape=(2,),
        url="./data/test/test_dataset2",
        mode="w",
    )
    ds.meta_information["description"] = "This is my description"
    ds["first"][0] = 2.3
def test_tensor_error():
    """Check that ``Tensor(None, max_shape=None)`` raises TypeError with the expected message."""
    try:
        Tensor(None, max_shape=None)
    except TypeError as ex:
        assert "shape cannot be None" in str(ex)
    else:
        # Without this branch the test passes silently when nothing is raised,
        # because the only assertion lives inside the except clause.
        raise AssertionError("Tensor(None, max_shape=None) did not raise TypeError")
def test_dataset_bug_2(url="./data/test/dataset", token=None):
    """Assign a one-element list of arrays to a one-sample slice; passes if nothing raises."""
    image_spec = Tensor((100, 100), "uint8")
    ds = Dataset(
        url,
        token=token,
        shape=(10000,),
        mode="w",
        schema={"image": image_spec},
    )
    # The value is a Python list wrapping the array, not a bare array.
    ds["image", 0:1] = [np.zeros((100, 100))]
def test_tensor_error():
    """Check that ``Tensor(None, max_shape=None)`` raises TypeError with the expected message."""
    try:
        Tensor(None, max_shape=None)
    except TypeError as ex:
        assert "both shape and max_shape cannot be None at the same time" in str(ex)
    else:
        # Without this branch the test passes silently when nothing is raised,
        # because the only assertion lives inside the except clause.
        raise AssertionError("Tensor(None, max_shape=None) did not raise TypeError")
def test_tensor_repr():
    """Check the repr of a default Tensor and of one with explicit shape and dtype."""
    default_tensor = Tensor()
    text_tensor = Tensor(shape=(5000,), dtype="<U20")

    assert repr(default_tensor) == "Tensor(shape=(None,), dtype='float64')"
    assert repr(text_tensor) == "Tensor(shape=(5000,), dtype='<U20')"
def test_tensor_init():
    """An integer ``shape`` combined with a two-element ``max_shape`` must raise ValueError."""
    mismatched_kwargs = {"shape": 2, "max_shape": (2, 2)}
    with pytest.raises(ValueError):
        Tensor(**mismatched_kwargs)
from hub import Dataset
from hub.api.datasetview import TensorView
from hub.exceptions import NoneValueException
from hub.schema import Tensor
import numpy as np
import pytest

my_schema = {
    "image": Tensor(
        (None, None, None, None), "uint8", max_shape=(10, 1920, 1080, 4)
    ),
    "label": float,
}

# Created once at import time and shared by the tests below.
ds = Dataset("./data/test/dataset", shape=(100,), mode="w", schema=my_schema)


def test_tensorview_init():
    """TensorView must reject a missing subpath and a missing dataset."""
    with pytest.raises(NoneValueException):
        TensorView(ds, subpath=None)

    with pytest.raises(NoneValueException):
        TensorView(dataset=None, subpath="image")


def test_tensorview_getitem():
    """Indexing a tensor view with a string where a sample index goes raises IndexError."""
    image_view = ds["image"]
    with pytest.raises(IndexError):
        image_view["7", 0:1920, 0:1080, 0:3].compute()
def test_dataset_change_schema():
    """Opening an existing dataset location with a differing schema must raise."""
    path = "./data/test_schema_change"
    schema = {
        "abc": "uint8",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    ds = Dataset(path, schema=schema, shape=(100,))

    # Each entry differs from the original schema in exactly one place.
    mismatched_schemas = [
        # Different size along the first dimension of "ghi".
        {
            "abc": "uint8",
            "def": {
                "ghi": Tensor((200, 100)),
                "rst": Tensor((100, 100, 100)),
            },
        },
        # Top-level key renamed.
        {
            "abrs": "uint8",
            "def": {
                "ghi": Tensor((100, 100)),
                "rst": Tensor((100, 100, 100)),
            },
        },
        # Nested key renamed.
        {
            "abc": "uint8",
            "def": {
                "ghijk": Tensor((100, 100)),
                "rst": Tensor((100, 100, 100)),
            },
        },
        # Different primitive dtype.
        {
            "abc": "uint16",
            "def": {
                "ghi": Tensor((100, 100)),
                "rst": Tensor((100, 100, 100)),
            },
        },
        # Extra dimension on "ghi".
        {
            "abc": "uint8",
            "def": {
                "ghi": Tensor((100, 100, 3)),
                "rst": Tensor((100, 100, 100)),
            },
        },
    ]

    for candidate in mismatched_schemas:
        with pytest.raises(SchemaMismatchException):
            ds = Dataset(path, schema=candidate, shape=(100,))
import numpy as np
import zarr

import hub
from hub.schema import Tensor, Image, Text
from hub.utils import Timer

# Fixed-shape schema used by test_pipeline_basic.
my_schema = {
    "image": Tensor((28, 28, 4), "int32", (28, 28, 4)),
    "label": Text((None,), "int64", (20,)),
    "confidence": "float",
}

# Variant whose image tensor has a dynamic shape.
dynamic_schema = {
    "image": Tensor(
        shape=(None, None, None), dtype="int32", max_shape=(32, 32, 3)
    ),
    "label": Text((None,), "int64", (20,)),
}


def test_pipeline_basic():
    """Fill every sample of a 100-sample dataset with an image, a label and a confidence."""
    ds = hub.Dataset(
        "./data/test/test_pipeline_basic",
        mode="w",
        shape=(100,),
        schema=my_schema,
    )

    for index in range(len(ds)):
        ds["image", index] = np.ones((28, 28, 4), dtype="int32")
        ds["label", index] = f"hello {index}"
        ds["confidence", index] = 0.2
def benchmark(sample_size=100, width=1000, channels=4, dtype="int8"):
    """Time a random-image transform pipeline under the "single" and "processed" schedulers.

    Numpy, Zarr and Hub containers of ``sample_size`` images are created up
    front. The sequential-write comparison that uses all of them is switched
    off; only the transform pipeline over ``ds_fs`` runs.
    """
    full_shape = (sample_size, width, width, channels)
    image_shape = (width, width, channels)

    numpy_arr = np.zeros(full_shape, dtype=dtype)
    zarr_fs = zarr.zeros(
        full_shape,
        dtype=dtype,
        store=zarr.storage.FSStore("./data/test/array"),
        overwrite=True,
    )
    zarr_lmdb = zarr.zeros(
        full_shape,
        dtype=dtype,
        store=zarr.storage.LMDBStore("./data/test/array2"),
        overwrite=True,
    )

    my_schema = {
        "image": Tensor(image_shape, dtype, image_shape),
    }

    # Same schema twice: once with cache=0 and once with the default cache.
    ds_fs = hub.Dataset(
        "./data/test/test_pipeline_basic_3",
        mode="w",
        shape=(sample_size,),
        schema=my_schema,
        cache=0,
    )
    ds_fs_cache = hub.Dataset(
        "./data/test/test_pipeline_basic_2",
        mode="w",
        shape=(sample_size,),
        schema=my_schema,
    )

    # Disabled in the original: sequential writes into each container.
    if False:
        print(
            f"~~~ Sequential write of {sample_size}x{width}x{width}x{channels} random arrays ~~~"
        )
        write_targets = [
            ("Numpy", numpy_arr),
            ("Zarr FS", zarr_fs),
            ("Zarr LMDB", zarr_lmdb),
            ("Hub FS", ds_fs["image"]),
            ("Hub FS+Cache", ds_fs_cache["image"]),
        ]
        for name, arr in write_targets:
            with Timer(name):
                for i in range(sample_size):
                    arr[i] = (np.random.rand(width, width, channels) * 255).astype(dtype)

    print(
        f"~~~ Pipeline {sample_size}x{width}x{width}x{channels} random arrays ~~~"
    )

    # Other schedulers left out by the original:
    # ("ray", 10), ("green", 10), ("dask", 10)
    scheduler_configs = [
        ("single", 1),
        ("processed", 10),
    ]
    for name, processes in scheduler_configs:

        @hub.transform(schema=my_schema, scheduler=name, processes=processes)
        def my_transform(sample):
            # The input sample is ignored; a fresh random image is produced.
            return {
                "image": (np.random.rand(width, width, channels) * 255).astype(dtype),
            }

        with Timer(name):
            out_ds = my_transform(ds_fs)
            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
def test_dataset_no_shape(url="./data/test/dataset", token=None):
    """Build a Tensor whose ``shape`` and ``max_shape`` differ, tolerating a ValueError.

    ``url`` and ``token`` are accepted but not used by the body.
    """
    fixed_shape = (120, 120, 3)
    upper_bound = (120, 120, 4)
    try:
        Tensor(shape=fixed_shape, max_shape=upper_bound)
    except ValueError:
        # The test passes whether or not the constructor rejects this pair.
        return
def test_objectview():
    """Exercise object views over nested sequences from datasets, dataset views and tensor views."""
    box_sequence = Sequence(dtype=BBox(dtype=float))
    nested_tensor_sequence = Sequence(
        dtype=SchemaDict({"d": Sequence((), dtype=Tensor((5, 5), dtype=float))})
    )
    dict_sequence = Sequence(
        dtype={
            "f": {
                "g": Tensor(5, dtype=int),
                "h": Tensor((), dtype=int),
            }
        }
    )
    schema = SchemaDict(
        {
            "a": Tensor((20, 20), dtype=int, max_shape=(20, 20)),
            "b": box_sequence,
            "c": nested_tensor_sequence,
            "e": dict_sequence,
        }
    )
    ds = hub.Dataset("./nested_seq", shape=(5,), mode="w", schema=schema)

    # dataset view to objectview
    view = ds[3:5]
    view["c", 0] = {"d": 5 * np.ones((2, 2, 5, 5))}
    assert (view[0, "c", 0, "d", 0].compute() == 5 * np.ones((5, 5))).all()

    # dataset view unsqueezed
    with pytest.raises(IndexError):
        view["c", "d"].compute()

    # dataset unsqueezed
    with pytest.raises(IndexError):
        ds["c", "d"].compute()

    # tensorview to object view
    # sequence of tensor
    ds["b", 0] = 0.5 * np.ones((5, 4))
    box_view = ds["b", 0]
    box_view[0] = 0.3 * np.ones((4,))
    assert (box_view[0].compute() == 0.3 * np.ones((4,))).all()

    # ds to object view
    assert (ds[3, "c", "d"].compute() == 5 * np.ones((2, 2, 5, 5))).all()

    # Sequence of schemadicts
    ds[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.array([42, 25, 15])}}
    with pytest.raises(KeyError):
        ds[0, "e", 1].compute()
    assert (ds[0, "e", "f", "h"].compute() == np.array([42, 25, 15])).all()

    # With dataset view
    view[0, "e"] = {"f": {"g": np.ones((3, 5)), "h": np.array([1, 25, 1])}}
    # dv[0, "e", 1]["f", "h"] = 25
    assert (view[0, "e", "f", "h"].compute() == np.array([1, 25, 1])).all()

    # If not lazy mode all slices should be stable
    ds.lazy = False
    assert ds[0, "e", 0, "f", "h"] == 42
    with pytest.raises(KeyError):
        # The comparison result is discarded; only the lookup matters here.
        ds[0, "e", 1]["f", "h"] == 25
    ds.lazy = True

    # make an objectview
    object_view = ds["c", "d"]
    with pytest.raises(IndexError):
        object_view.compute()
    assert (object_view[3].compute() == 5 * np.ones((2, 2, 5, 5))).all()
    # ov[3, 1] = 2 * np.ones((2, 5, 5))
    assert (object_view[3][0, 0].compute() == 5 * np.ones((5, 5))).all()
    assert (object_view[3][1].compute() == 5 * np.ones((2, 5, 5))).all()
def test_tensor_repr():
    """Check the repr of a Tensor built with an explicit shape and dtype."""
    text_tensor = Tensor(shape=(5000,), dtype="<U20")
    expected = "Tensor(shape=(5000,), dtype='<U20')"
    assert repr(text_tensor) == expected
def test_tensor_error_2():
    """Check the exception type raised by a series of invalid Tensor constructor calls."""
    # (expected exception, positional args, keyword args), in the original order.
    invalid_calls = [
        (TypeError, (), {"shape": 5.1}),
        (TypeError, (), {"shape": (5.1,)}),
        (TypeError, (), {"shape": (5, 6), "max_shape": (7.2, 8)}),
        (ValueError, (), {"shape": (5, 6), "max_shape": (7, 8, 9)}),
        (TypeError, (), {"shape": (5, None), "max_shape": (5, None)}),
        # Same arguments as the third case; the original repeats it.
        (TypeError, (), {"shape": (5, 6), "max_shape": (7.2, 8)}),
        (ValueError, (), {"max_shape": (10, 15)}),
        (TypeError, (None,), {}),
        (ValueError, ((5, 6, None),), {}),
        (TypeError, (), {"max_shape": "abc"}),
        (TypeError, (), {"max_shape": (7.4, 2)}),
        (ValueError, (), {"max_shape": []}),
    ]

    for expected_error, args, kwargs in invalid_calls:
        with pytest.raises(expected_error):
            Tensor(*args, **kwargs)