Пример #1
0
def test_check_label_name():
    """ClassLabel ints round-trip to their string names via compute(label_name=True)."""
    my_schema = {"label": ClassLabel(names=["red", "green", "blue"])}
    ds = Dataset("./data/test/dataset2", shape=(5,), mode="w", schema=my_schema)
    # Write label indices (rows 0 and 1 are intentionally written twice).
    for row, value in [(0, 1), (1, 2), (0, 1), (1, 2), (2, 0)]:
        ds["label", row] = value
    # Rows 3 and 4 were never written, so they hold the default 0 -> "red".
    expected_names = ["green", "blue", "red", "red", "red"]
    expected_values = [1, 2, 0, 0, 0]
    assert ds.compute(label_name=True) == [{"label": n} for n in expected_names]
    assert ds.compute() == [{"label": v} for v in expected_values]
    # Single-row and sliced views honour the same flag.
    assert ds[1].compute(label_name=True) == {"label": "blue"}
    assert ds[1].compute() == {"label": 2}
    assert ds[1:3].compute(label_name=True) == [{"label": "blue"}, {"label": "red"}]
    assert ds[1:3].compute() == [{"label": 2}, {"label": 0}]
Пример #2
0
def test_tensorview_slicing():
    """Slicing a dynamic tensor yields views whose numpy shape matches .shape."""
    dt = {"first": Tensor(shape=(None, None), max_shape=(250, 300))}
    ds = Dataset(schema=dt, shape=(20, ), url="./data/test/model", mode="w")
    view_a = ds["first", 5:6, 7:10, 9:10]
    assert view_a.numpy().shape == tuple(view_a.shape) == (1, 3, 1)
    # A bare integer index on the last axis drops that dimension.
    view_b = ds["first", 5:6, 7:10, 9]
    assert view_b.numpy().shape == tuple(view_b.shape) == (1, 3)
Пример #3
0
def test_meta_information():
    """meta_information supplied at creation is persisted under ds.meta["meta_info"]."""
    description = {
        "author": "testing",
        "description": "here goes the testing text",
    }

    schema = {"text": Text((None, ), max_shape=(1000, ))}

    ds = Dataset(
        "./data/test_meta",
        shape=(10, ),
        schema=schema,
        meta_information=description,
        mode="w",
    )

    some_text = ["hello world", "hello penguin", "hi penguin"]

    for i, text in enumerate(some_text):
        ds["text", i] = text

    # isinstance is the idiomatic type check (was: type(...) == dict).
    # Also removed the unused `description_changed` local.
    assert isinstance(ds.meta["meta_info"], dict)
    assert ds.meta["meta_info"]["author"] == "testing"
    assert ds.meta["meta_info"]["description"] == "here goes the testing text"

    ds.close()
Пример #4
0
def test_dataset_dynamic_shaped():
    """Region writes into a dynamic-shape tensor read back the same data."""
    schema = {
        "first":
        Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100, ),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(1000, ),
        mode="w",
        schema=schema,
    )

    ds["first", 50, 50:60, 50:60] = np.ones((10, 10), "int32")
    assert (
        ds["first", 50, 50:60, 50:60].numpy() == np.ones((10, 10), "int32")
    ).all()

    ds["first", 0, :10, :10] = np.ones((10, 10), "int32")
    ds["first", 0, 10:20, 10:20] = 5 * np.ones((10, 10), "int32")
    assert (
        ds["first", 0, 0:10, 0:10].numpy() == np.ones((10, 10), "int32")
    ).all()
    # Bug fix: the second region write (5 * ones) was never verified.
    assert (
        ds["first", 0, 10:20, 10:20].numpy() == 5 * np.ones((10, 10), "int32")
    ).all()
Пример #5
0
def test_sharded_dataset_with_views():
    """Point indexing across a ShardedDatasetView built from views and a dataset.

    Shard layout (15 rows total), mapping global index -> source row:
      shard 0: ds[3:5] -> rows 3, 4   (global 0-1)
      shard 1: ds      -> rows 0..9   (global 2-11)
      shard 2: ds[1]   -> row 1       (global 12)
      shard 3: ds[8:]  -> rows 8, 9   (global 13-14)
    """
    schema = {"first": "float", "second": "float"}
    ds = Dataset("./data/test_sharded_ds",
                 shape=(10, ),
                 schema=schema,
                 mode="w")
    # Each source row i holds first=i, second=2*i+1.
    for i in range(10):
        ds[i, "first"] = i
        ds[i, "second"] = 2 * i + 1

    dsv = ds[3:5]
    dsv2 = ds[1]
    dsv3 = ds[8:]
    datasets = [dsv, ds, dsv2, dsv3]
    sharded_ds = ShardedDatasetView(datasets)
    # Globals 0-1 fall in ds[3:5] -> source rows 3 and 4.
    for i in range(2):
        assert sharded_ds[i, "first"].compute() == i + 3
        assert sharded_ds[i, "second"].compute() == 2 * (i + 3) + 1
    # Globals 2-11 fall in the full dataset -> source rows 0..9.
    for i in range(2, 12):
        assert sharded_ds[i, "first"].compute() == i - 2
        assert sharded_ds[i, "second"].compute() == 2 * (i - 2) + 1
    # Global 12 is the single-row view ds[1].
    assert sharded_ds[12, "first"].compute() == 1
    assert sharded_ds[12, "second"].compute() == 3
    # Globals 13-14 fall in ds[8:] -> source rows 8 and 9.
    for i in range(13, 15):
        assert sharded_ds[i, "first"].compute() == i - 5
        assert sharded_ds[i, "second"].compute() == 2 * (i - 5) + 1
Пример #6
0
def test_dataset_dynamic_shaped_slicing():
    """Row i of a dynamic tensor holds an (i, i) array of i; slicing preserves it."""
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100, ),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(100, ),
        mode="w",
        schema=schema,
    )

    for row in range(100):
        ds["first", row] = row * np.ones((row, row))

    # A full-range slice computes every (differently-shaped) row.
    computed = ds["first", 0:100].compute()
    for row in range(100):
        assert (computed[row] == row * np.ones((row, row))).all()

    # A single-row slice still returns a list-like of one array.
    assert (ds["first", 1:2].compute()[0] == np.ones((1, 1))).all()
Пример #7
0
def test_sharded_dataset():
    """Basic ShardedDatasetView behaviour over four freshly-created 10-row shards."""
    dt = {"first": "float", "second": "float"}
    datasets = [
        Dataset(schema=dt,
                shape=(10, ),
                url=f"./data/test/test_dataset/{i}",
                mode="w") for i in range(4)
    ]

    ds = ShardedDatasetView(datasets)

    ds[0]["first"] = 2.3
    # Writing "first" must not bleed into "second".
    assert ds[0]["second"].numpy() != 2.3
    # Index 30 lands in a later, untouched shard -> default value 0.
    assert ds[30]["first"].numpy() == 0
    assert len(ds) == 40
    assert ds.shape == (40, )
    assert type(ds.schema) == SchemaDict
    assert ds.__repr__() == "ShardedDatasetView(shape=(40,))"
    # Slicing a sharded view is not supported and must raise.
    with pytest.raises(AdvancedSlicingNotSupported):
        ds[5:8]
    ds[4, "first"] = 3
    # Iterating the sharded view should not raise.
    for _ in ds:
        pass

    # An empty sharded view still resolves shard lookups to (0, 0).
    ds2 = ShardedDatasetView([])
    assert ds2.identify_shard(5) == (0, 0)
Пример #8
0
def test_dataset_bug_1(url="./data/test/dataset", token=None):
    """Regression: assigning into a tensor with dynamic first and last axes."""
    schema = {
        "image": Tensor(
            (None, 1920, 1080, None), "uint8", max_shape=(10, 1920, 1080, 4)
        ),
    }
    ds = Dataset(url, token=token, shape=(10000,), mode="w", schema=schema)
    # Write must succeed even though axes 0 and 3 are dynamic.
    ds["image", 1] = np.ones((2, 1920, 1080, 1))
Пример #9
0
def test_append_dataset():
    """append_shape extends a dataset in place and persists across flush/reopen."""
    schema = {"first": Tensor(shape=(250, 300)), "second": "float"}
    url = "./data/test/model"
    ds = Dataset(schema=schema, shape=(100,), url=url, mode="w")
    ds.append_shape(20)
    ds["first"][0] = np.ones((250, 300))

    def check_shapes(dataset):
        # 100 original rows + 20 appended = 120.
        assert dataset["first"].shape[0] == 120
        assert dataset["first", 5:10].shape[0] == 5
        assert dataset["second"].shape[0] == 120

    assert len(ds) == 120
    check_shapes(ds)
    ds.flush()

    # The appended length must survive a reopen from storage.
    check_shapes(Dataset(url))
Пример #10
0
def test_dataset_2():
    """meta_information is writable after creation and reads back unchanged."""
    schema = {"first": "float", "second": "float"}
    ds = Dataset(
        schema=schema, shape=(2,), url="./data/test/test_dataset2", mode="w"
    )
    ds.meta_information["description"] = "This is my description"

    ds["first"][0] = 2.3
    # The meta entry survives an unrelated data write...
    assert ds.meta_information["description"] == "This is my description"
    # ...and the write to "first" does not leak into "second".
    assert ds["second"][0].numpy() != 2.3
Пример #11
0
def time_random_access(dataset_name="activeloop/mnist",
                       offset=1000,
                       span=1000,
                       field="image"):
    """Time reading `span` rows of `field` starting at `offset`, with caches off."""
    # Disable both layers of caching so the read timing reflects storage access.
    dset = Dataset(dataset_name, cache=False, storage_cache=False)
    with Timer(
            f"{dataset_name} read at offset {offset:03} of length {span:03}"):
        dset[field][offset:offset + span].compute()
Пример #12
0
def test_datasetview_slicing():
    """Integer indexing drops the sample axis; slice indexing keeps it."""
    schema = {"first": Tensor((100, 100))}
    ds = Dataset(schema=schema, shape=(20,), url="./data/test/model", mode="w")

    # Key-first access.
    assert ds["first", 0].numpy().shape == (100, 100)
    assert ds["first", 0:1].numpy().shape == (1, 100, 100)
    # Row-first access must agree with key-first access.
    assert ds[0]["first"].numpy().shape == (100, 100)
    assert ds[0:1]["first"].numpy().shape == (1, 100, 100)
Пример #13
0
def test_append_resize():
    """append_shape adds rows; resize_shape sets the absolute row count."""
    schema = {"first": Tensor(shape=(250, 300)), "second": "float"}
    ds = Dataset(
        schema=schema,
        shape=(100,),
        url="./data/test/append_resize",
        mode="a",
    )
    ds.append_shape(20)
    assert len(ds) == 120
    ds.resize_shape(150)
    assert len(ds) == 150
Пример #14
0
def test_dataset2():
    """Writing one float column must not affect the other."""
    schema = {"first": "float", "second": "float"}
    ds = Dataset(
        schema=schema, shape=(2, ), url="./data/test/test_dataset2", mode="w"
    )

    ds["first"][0] = 2.3
    assert ds["second"][0].numpy() != 2.3
Пример #15
0
def test_tensorview_slicing():
    """Slices of a dynamic tensor report per-sample shapes via .shape[0]."""
    dt = {"first": Tensor(shape=(None, None), max_shape=(250, 300))}
    ds = Dataset(schema=dt, shape=(20,), url="./data/test/tensorivew_slicing", mode="w")
    tv = ds["first", 5:6, 7:10, 9:10]
    # Round-trip the lazy flag; compute() below should behave the same afterwards.
    tv.disable_lazy()
    tv.enable_lazy()
    assert tv.compute()[0].shape == tuple(tv.shape[0]) == (3, 1)
    # A bare integer on the last axis drops that dimension from the sample shape.
    tv2 = ds["first", 5:6, 7:10, 9]
    assert tv2.numpy()[0].shape == tuple(tv2.shape[0]) == (3,)
Пример #16
0
def test_class_label_2():
    """ClassLabel round-trips for scalar, variable-length, and fixed-length labels."""
    cl1 = ClassLabel(names=["apple", "banana", "cat"])
    cl2 = ClassLabel((None, ), (10, ), names=["apple", "banana", "cat"])
    cl3 = ClassLabel((3, ), names=["apple", "banana", "cat"])
    my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3}

    # Bug fix: shape=(10) is just the int 10, not a 1-tuple; every other
    # Dataset call in this file passes a tuple, so use (10,) here too.
    ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10,), mode="w")

    # Scalar labels, written singly and via a slice.
    ds["cl1", 0] = cl1.str2int("cat")
    ds["cl1", 1] = cl1.str2int("apple")
    ds["cl1", 2] = cl1.str2int("apple")
    ds["cl1", 3:5] = [cl1.str2int("banana"), cl1.str2int("banana")]
    assert ds["cl1", 1].compute(True) == "apple"
    assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"]
    assert ds["cl1", 3:5].compute(True) == ["banana", "banana"]

    # Variable-length label vectors (max length 10) of differing sizes per row.
    ds["cl2", 0] = np.array(
        [cl2.str2int("cat"),
         cl2.str2int("cat"),
         cl2.str2int("apple")])
    ds["cl2", 1] = np.array([cl2.str2int("apple"), cl2.str2int("banana")])
    ds["cl2", 2] = np.array([
        cl2.str2int("cat"),
        cl2.str2int("apple"),
        cl2.str2int("banana"),
        cl2.str2int("apple"),
        cl2.str2int("banana"),
    ])
    ds["cl2", 3] = np.array([cl2.str2int("cat")])
    assert ds["cl2", 0].compute(True) == ["cat", "cat", "apple"]
    assert ds["cl2", 1].compute(True) == ["apple", "banana"]
    assert ds["cl2", 2].compute(True) == [
        "cat", "apple", "banana", "apple", "banana"
    ]
    assert ds["cl2", 3].compute(True) == ["cat"]

    # Fixed-length (3,) label vectors.
    ds["cl3", 0] = np.array(
        [cl3.str2int("apple"),
         cl3.str2int("apple"),
         cl3.str2int("apple")])
    ds["cl3", 1] = np.array(
        [cl3.str2int("banana"),
         cl3.str2int("banana"),
         cl3.str2int("banana")])
    ds["cl3", 2] = np.array(
        [cl3.str2int("cat"),
         cl3.str2int("cat"),
         cl3.str2int("cat")])
    assert ds["cl3", 0].compute(True) == ["apple", "apple", "apple"]
    assert ds["cl3", 1].compute(True) == ["banana", "banana", "banana"]
    assert ds["cl3", 2].compute(True) == ["cat", "cat", "cat"]
    assert ds["cl3", 0:3].compute(True) == [
        ["apple", "apple", "apple"],
        ["banana", "banana", "banana"],
        ["cat", "cat", "cat"],
    ]
Пример #17
0
def test_tensorview_iter():
    """Iterating a single-element TensorView yields items equal to the stored value."""
    schema = {"abc": "int32"}
    ds = Dataset(
        schema=schema, shape=(20,), url="./data/test/tensorivew_slicing", mode="w"
    )
    for value in range(20):
        ds["abc", value] = value
    view = ds["abc", 3]
    for element in view:
        assert element.compute() == 3
Пример #18
0
def test_dataset_append_and_read():
    """Open in append mode, write, close, then reopen read-only and delete."""
    dt = {"first": "float", "second": "float"}
    ds = Dataset(
        schema=dt,
        shape=(2,),
        url="./data/test/test_dataset_append_and_read",
        mode="a",
    )

    ds["first"][0] = 2.3
    # The write to "first" must not affect "second".
    assert ds["second"][0].numpy() != 2.3
    ds.close()

    # Reopen the persisted dataset read-only, then clean it up.
    ds = Dataset(
        url="./data/test/test_dataset_append_and_read",
        mode="r",
    )
    ds.delete()
    ds.close()
def benchmark_iterate_hub_local_tensorflow_setup(
    dataset_name, dataset_split, batch_size, prefetch_factor
):
    """Materialize a TFDS dataset locally and build a batched, prefetching TF loader.

    Returns a 1-tuple holding the loader, ready for iteration benchmarks.
    """
    source = Dataset.from_tfds(dataset_name, split=dataset_split)
    local_path = os.path.join(".", "hub_data", "tfds")
    source.store(local_path)
    # Re-open read-only with both cache layers disabled (cache / storage_cache).
    reopened = Dataset(local_path, cache=False, storage_cache=False, mode="r")

    loader = reopened.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)

    return (loader,)
Пример #20
0
def test_text_dataset():
    """A Text field stores and returns a long string verbatim."""
    schema = {
        "names": Text(shape=(None, ), max_shape=(1000, ), dtype="int64"),
    }
    ds = Dataset(
        "./data/test/testing_text", mode="w", schema=schema, shape=(10, )
    )
    sample = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    ds["names", 4] = sample
    assert ds["names", 4].numpy() == sample
Пример #21
0
def test_dataset_filter_4():
    """filter() keeps only matching rows, and slices of the view read correctly."""
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset("./data/tests/filtering_4", shape=(100,), schema=schema, mode="w")
    for idx in range(100):
        # First 10 rows are class 0, the remaining 90 are class 1.
        ds["cl", idx] = 0 if idx < 10 else 1
        ds["img", idx] = idx * np.ones((5, 6, 3))
    zeros_view = ds.filter(lambda sample: sample["cl"].compute() == 0)
    # The filtered view has 10 rows; any slice of "cl" is all zeros.
    assert (zeros_view[3:8, "cl"].compute() == np.zeros((5,))).all()
Пример #22
0
def test_datasetview_repr():
    """A DatasetView's repr embeds the full repr of its backing Dataset."""
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_repr"
    ds = Dataset(schema=dt, shape=(9,), url=url, mode="w", lazy=False)
    dsv = ds[2:]
    # NOTE: the expected repr shows the parent dataset's shape (9,), not the view's.
    print_text = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))}), url='./data/test/dsv_repr', shape=(9,), mode='w'))"
    assert dsv.__repr__() == print_text
Пример #23
0
def benchmark():
    """Iterate a remote dataset through a PyTorch DataLoader, timed via tqdm."""
    # schema/arr feed the one-time dataset creation kept below for reference.
    schema = {"image": Tensor((256, 256, 3), dtype="uint8")}
    arr = (np.random.rand(256, 256, 3) * 100).astype("uint8")
    # ds = Dataset("s3://snark-test/superficial_dataset", mode="w", schema=schema, shape=(5000,))
    # for i in tqdm(range(len(ds))):
    #     ds["image", i] = arr
    # ds.close()
    ds = Dataset("s3://snark-test/superficial_dataset")
    tds = ds.to_pytorch()
    dl = torch.utils.data.DataLoader(tds, batch_size=32, num_workers=16)
    # Fix: the enumerate index and batch variables were unused; just drain the loader.
    for _ in tqdm(dl):
        pass
Пример #24
0
def test_sharded_dataset_advanced_slice():
    """Slicing, negative indexing, and assignment on a ShardedDatasetView.

    Shard layout (15 rows total), mapping global index -> source row:
      shard 0: ds[3:5] -> rows 3, 4   (global 0-1)
      shard 1: ds      -> rows 0..9   (global 2-11)
      shard 2: ds[1]   -> row 1       (global 12)
      shard 3: ds[8:]  -> rows 8, 9   (global 13-14)
    """
    schema = {"first": "float", "second": "float"}
    ds = Dataset("./data/test_sharded_ds", shape=(10,), schema=schema, mode="w")
    # Each source row i holds first=i, second=2*i+1.
    for i in range(10):
        ds[i, "first"] = i
        ds[i, "second"] = 2 * i + 1

    dsv = ds[3:5]
    dsv2 = ds[1]
    dsv3 = ds[8:]
    datasets = [dsv, ds, dsv2, dsv3]
    sharded_ds = ShardedDatasetView(datasets)
    # A full-range slice concatenates the shards in order.
    assert sharded_ds["first", :].compute().tolist() == [
        3,
        4,
        0,
        1,
        2,
        3,
        4,
        5,
        6,
        7,
        8,
        9,
        1,
        8,
        9,
    ]
    # Omitting the slice is equivalent to [:].
    assert sharded_ds["first"].compute().tolist() == [
        3,
        4,
        0,
        1,
        2,
        3,
        4,
        5,
        6,
        7,
        8,
        9,
        1,
        8,
        9,
    ]
    # Negative slicing counts from the end of the concatenated view.
    assert sharded_ds["first", -4:].compute().tolist() == [9, 1, 8, 9]
    # Row slices return dicts of column values (globals 1 and 2 here).
    assert sharded_ds[1:3].compute()[0] == {"first": 4.0, "second": 9.0}
    assert sharded_ds[1:3].compute()[1] == {"first": 0.0, "second": 1.0}
    # Writes through the sharded view land in the correct underlying shards.
    sharded_ds["first", 1:5] = [10, 11, 12, 13]
    assert sharded_ds["first", 1:5].compute().tolist() == [10, 11, 12, 13]
    sharded_ds["first", 12] = 50
    assert sharded_ds["first", 12].compute() == 50
Пример #25
0
def time_iter_hub_wasabi_tensorflow(
    dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None
):
    """Time one full pass over a remote (Wasabi-hosted) dataset via to_tensorflow().

    dataset_info: mapping providing the dataset's "hub_name".
    process: optional callback invoked per batch with (image, label).
    """
    # Both cache layers disabled so timing reflects remote reads.
    dset = Dataset(dataset_info["hub_name"], cache=False, storage_cache=False, mode="r")
    loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)

    with Timer("Hub (remote - Wasabi) `.to_tensorflow()`"):
        for batch in loader:
            image = batch["image"]
            label = batch["label"]
            if process is not None:
                process(image, label)
Пример #26
0
def test_dataset_filter():
    """filter() returns a view whose indexes are exactly the matching rows."""

    def starts_with_abc(sample):
        return sample["ab"].compute().startswith("abc")

    my_schema = {"img": Tensor((100, 100)), "ab": Text((None,), max_shape=(10,))}
    ds = Dataset("./data/new_filter", shape=(10,), schema=my_schema)
    for idx in range(10):
        ds["img", idx] = idx * np.ones((100, 100))
        # Even rows get an "abc" prefix, odd rows "def".
        prefix = "abc" if idx % 2 == 0 else "def"
        ds["ab", idx] = prefix + str(idx)

    filtered = ds.filter(starts_with_abc)
    assert filtered.indexes == [0, 2, 4, 6, 8]
Пример #27
0
def test_dataset_copy_azure_local():
    """Copy an Azure-hosted dataset to local storage, then back to Azure; verify."""
    token = {"account_key": os.getenv("ACCOUNT_KEY")}
    ds = Dataset(
        "https://activeloop.blob.core.windows.net/activeloop-hub/cp_original_test_ds_azure_1",
        token=token,
        shape=(100,),
        schema=simple_schema,
    )
    DS2_PATH = "./data/testing/cp_copy_ds_local_4"
    DS3_PATH = "https://activeloop.blob.core.windows.net/activeloop-hub/cp_copy_test_ds_azure_2"
    for i in range(100):
        ds["num", i] = 2 * i
    # If a previous run left the target behind, delete it and retry the copy.
    # Bug fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # narrowed to Exception.
    try:
        ds2 = ds.copy(DS2_PATH)
    except Exception:
        dsi = Dataset(DS2_PATH)
        dsi.delete()
        ds2 = ds.copy(DS2_PATH)

    try:
        ds3 = ds2.copy(
            DS3_PATH,
            token=token,
        )
    except Exception:
        dsi = Dataset(
            DS3_PATH,
            token=token,
        )
        dsi.delete()
        ds3 = ds2.copy(
            DS3_PATH,
            token=token,
        )
    # Both copies must contain the same data as the source.
    for i in range(100):
        assert ds2["num", i].compute() == 2 * i
        assert ds3["num", i].compute() == 2 * i
    ds.delete()
    ds2.delete()
    ds3.delete()
Пример #28
0
def test_datasetview_get_dictionary():
    """Indexing a DatasetView at a dict-schema key returns nested computable entries."""
    # NOTE(review): relies on a module-level `my_schema` with a nested "label"
    # dict containing keys "a" and "d"/"e" — defined elsewhere in this file.
    ds = Dataset(
        schema=my_schema,
        shape=(20, ),
        url="./data/test/datasetview_get_dictionary",
        mode="w",
    )
    ds["label", 5, "a"] = 5 * np.ones((100, 200))
    ds["label", 5, "d", "e"] = 3 * np.ones((5, 3))
    dsv = ds[2:10]
    # View index 3 maps back to absolute row 5 (the view starts at row 2).
    dic = dsv[3, "label"]
    assert (dic["a"].compute() == 5 * np.ones((100, 200))).all()
    assert (dic["d"]["e"].compute() == 3 * np.ones((5, 3))).all()
Пример #29
0
def test_dataset_with_chunks():
    """Scalar and region writes on a chunked schema round-trip unchanged."""
    # NOTE(review): relies on a module-level `my_schema_with_chunks` with
    # "label/a" and a 5-D "image" field — defined elsewhere in this file.
    ds = Dataset(
        "./data/test/dataset_with_chunks",
        token=None,
        shape=(10000, ),
        mode="w",
        schema=my_schema_with_chunks,
    )
    # Single scalar element deep inside a nested field.
    ds["label/a", 5, 50, 50] = 8
    assert ds["label/a", 5, 50, 50].numpy() == 8
    # A (100, 150, 3) region write inside one image.
    ds["image", 5, 4, 100:200, 150:300, :] = np.ones((100, 150, 3), "uint8")
    assert (ds["image", 5, 4, 100:200, 150:300, :].numpy() == np.ones(
        (100, 150, 3), "uint8")).all()
Пример #30
0
def test_text_dataset():
    """Text fields round-trip via tuple and chained indexing, on datasets and views."""
    schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    ds = Dataset("./data/test/testing_text", mode="w", schema=schema, shape=(10,))
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    # Tuple-style write/read.
    ds["names", 4] = text + "4"
    assert ds["names", 4].numpy() == text + "4"
    # Chained-index write/read must behave the same.
    ds["names"][5] = text + "5"
    assert ds["names"][5].numpy() == text + "5"
    # Same two access styles through a DatasetView (view row 0 == absolute row 7).
    dsv = ds[7:9]
    dsv["names", 0] = text + "7"
    assert dsv["names", 0].numpy() == text + "7"
    dsv["names"][1] = text + "8"
    assert dsv["names"][1].numpy() == text + "8"

    # Fixed-shape Text accepts batched string assignment via a row slice.
    schema2 = {
        "id": Text(shape=(4,), dtype="int64"),
    }
    ds2 = Dataset("./data/test/testing_text_2", mode="w", schema=schema2, shape=(10,))
    ds2[0:5, "id"] = ["abcd", "efgh", "ijkl", "mnop", "qrst"]
    assert ds2[2:4, "id"].compute() == ["ijkl", "mnop"]