Пример #1
0
def test_check_label_name():
    """ClassLabel ints round-trip to their string names via compute(label_name=True)."""
    my_schema = {"label": ClassLabel(names=["red", "green", "blue"])}
    ds = Dataset("./data/test/dataset2", shape=(5,), mode="w", schema=my_schema)
    # Write label indices (rows 0 and 1 are intentionally written twice).
    for row, value in [(0, 1), (1, 2), (0, 1), (1, 2), (2, 0)]:
        ds["label", row] = value
    # Rows 3 and 4 were never written, so they hold the default 0 -> "red".
    expected_names = ["green", "blue", "red", "red", "red"]
    expected_values = [1, 2, 0, 0, 0]
    assert ds.compute(label_name=True) == [{"label": n} for n in expected_names]
    assert ds.compute() == [{"label": v} for v in expected_values]
    # Single-row and sliced views honour the same flag.
    assert ds[1].compute(label_name=True) == {"label": "blue"}
    assert ds[1].compute() == {"label": 2}
    assert ds[1:3].compute(label_name=True) == [{"label": "blue"}, {"label": "red"}]
    assert ds[1:3].compute() == [{"label": 2}, {"label": 0}]
Пример #2
0
def test_tensorview_slicing():
    """Slicing a dynamic tensor yields views whose numpy shape matches .shape."""
    dt = {"first": Tensor(shape=(None, None), max_shape=(250, 300))}
    ds = Dataset(schema=dt, shape=(20, ), url="./data/test/model", mode="w")
    view_a = ds["first", 5:6, 7:10, 9:10]
    assert view_a.numpy().shape == tuple(view_a.shape) == (1, 3, 1)
    # A bare integer index on the last axis drops that dimension.
    view_b = ds["first", 5:6, 7:10, 9]
    assert view_b.numpy().shape == tuple(view_b.shape) == (1, 3)
Пример #3
0
def test_meta_information():
    """meta_information supplied at creation is persisted under ds.meta["meta_info"]."""
    description = {
        "author": "testing",
        "description": "here goes the testing text",
    }

    schema = {"text": Text((None, ), max_shape=(1000, ))}

    ds = Dataset(
        "./data/test_meta",
        shape=(10, ),
        schema=schema,
        meta_information=description,
        mode="w",
    )

    some_text = ["hello world", "hello penguin", "hi penguin"]

    for i, text in enumerate(some_text):
        ds["text", i] = text

    # isinstance is the idiomatic type check (was: type(...) == dict).
    # Also removed the unused `description_changed` local.
    assert isinstance(ds.meta["meta_info"], dict)
    assert ds.meta["meta_info"]["author"] == "testing"
    assert ds.meta["meta_info"]["description"] == "here goes the testing text"

    ds.close()
Пример #4
0
def test_dataset_dynamic_shaped():
    """Region writes into a dynamic-shape tensor read back the same data."""
    schema = {
        "first":
        Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100, ),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(1000, ),
        mode="w",
        schema=schema,
    )

    ds["first", 50, 50:60, 50:60] = np.ones((10, 10), "int32")
    assert (
        ds["first", 50, 50:60, 50:60].numpy() == np.ones((10, 10), "int32")
    ).all()

    ds["first", 0, :10, :10] = np.ones((10, 10), "int32")
    ds["first", 0, 10:20, 10:20] = 5 * np.ones((10, 10), "int32")
    assert (
        ds["first", 0, 0:10, 0:10].numpy() == np.ones((10, 10), "int32")
    ).all()
    # Bug fix: the second region write (5 * ones) was never verified.
    assert (
        ds["first", 0, 10:20, 10:20].numpy() == 5 * np.ones((10, 10), "int32")
    ).all()
Пример #5
0
def test_sharded_dataset_with_views():
    """Point indexing across a ShardedDatasetView built from views and a dataset.

    Shard layout (15 rows total), mapping global index -> source row:
      shard 0: ds[3:5] -> rows 3, 4   (global 0-1)
      shard 1: ds      -> rows 0..9   (global 2-11)
      shard 2: ds[1]   -> row 1       (global 12)
      shard 3: ds[8:]  -> rows 8, 9   (global 13-14)
    """
    schema = {"first": "float", "second": "float"}
    ds = Dataset("./data/test_sharded_ds",
                 shape=(10, ),
                 schema=schema,
                 mode="w")
    # Each source row i holds first=i, second=2*i+1.
    for i in range(10):
        ds[i, "first"] = i
        ds[i, "second"] = 2 * i + 1

    dsv = ds[3:5]
    dsv2 = ds[1]
    dsv3 = ds[8:]
    datasets = [dsv, ds, dsv2, dsv3]
    sharded_ds = ShardedDatasetView(datasets)
    # Globals 0-1 fall in ds[3:5] -> source rows 3 and 4.
    for i in range(2):
        assert sharded_ds[i, "first"].compute() == i + 3
        assert sharded_ds[i, "second"].compute() == 2 * (i + 3) + 1
    # Globals 2-11 fall in the full dataset -> source rows 0..9.
    for i in range(2, 12):
        assert sharded_ds[i, "first"].compute() == i - 2
        assert sharded_ds[i, "second"].compute() == 2 * (i - 2) + 1
    # Global 12 is the single-row view ds[1].
    assert sharded_ds[12, "first"].compute() == 1
    assert sharded_ds[12, "second"].compute() == 3
    # Globals 13-14 fall in ds[8:] -> source rows 8 and 9.
    for i in range(13, 15):
        assert sharded_ds[i, "first"].compute() == i - 5
        assert sharded_ds[i, "second"].compute() == 2 * (i - 5) + 1
Пример #6
0
def test_dataset_dynamic_shaped_slicing():
    """Row i of a dynamic tensor holds an (i, i) array of i; slicing preserves it."""
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100, ),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(100, ),
        mode="w",
        schema=schema,
    )

    for row in range(100):
        ds["first", row] = row * np.ones((row, row))

    # A full-range slice computes every (differently-shaped) row.
    computed = ds["first", 0:100].compute()
    for row in range(100):
        assert (computed[row] == row * np.ones((row, row))).all()

    # A single-row slice still returns a list-like of one array.
    assert (ds["first", 1:2].compute()[0] == np.ones((1, 1))).all()
Пример #7
0
def test_sharded_dataset():
    """Basic ShardedDatasetView behaviour over four freshly-created 10-row shards."""
    dt = {"first": "float", "second": "float"}
    datasets = [
        Dataset(schema=dt,
                shape=(10, ),
                url=f"./data/test/test_dataset/{i}",
                mode="w") for i in range(4)
    ]

    ds = ShardedDatasetView(datasets)

    ds[0]["first"] = 2.3
    # Writing "first" must not bleed into "second".
    assert ds[0]["second"].numpy() != 2.3
    # Index 30 lands in a later, untouched shard -> default value 0.
    assert ds[30]["first"].numpy() == 0
    assert len(ds) == 40
    assert ds.shape == (40, )
    assert type(ds.schema) == SchemaDict
    assert ds.__repr__() == "ShardedDatasetView(shape=(40,))"
    # Slicing a sharded view is not supported and must raise.
    with pytest.raises(AdvancedSlicingNotSupported):
        ds[5:8]
    ds[4, "first"] = 3
    # Iterating the sharded view should not raise.
    for _ in ds:
        pass

    # An empty sharded view still resolves shard lookups to (0, 0).
    ds2 = ShardedDatasetView([])
    assert ds2.identify_shard(5) == (0, 0)
Пример #8
0
def test_dataset_bug_1(url="./data/test/dataset", token=None):
    """Regression: assigning into a tensor with dynamic first and last axes."""
    schema = {
        "image": Tensor(
            (None, 1920, 1080, None), "uint8", max_shape=(10, 1920, 1080, 4)
        ),
    }
    ds = Dataset(url, token=token, shape=(10000,), mode="w", schema=schema)
    # Write must succeed even though axes 0 and 3 are dynamic.
    ds["image", 1] = np.ones((2, 1920, 1080, 1))
Пример #9
0
def test_append_dataset():
    """append_shape extends a dataset in place and persists across flush/reopen."""
    schema = {"first": Tensor(shape=(250, 300)), "second": "float"}
    url = "./data/test/model"
    ds = Dataset(schema=schema, shape=(100,), url=url, mode="w")
    ds.append_shape(20)
    ds["first"][0] = np.ones((250, 300))

    def check_shapes(dataset):
        # 100 original rows + 20 appended = 120.
        assert dataset["first"].shape[0] == 120
        assert dataset["first", 5:10].shape[0] == 5
        assert dataset["second"].shape[0] == 120

    assert len(ds) == 120
    check_shapes(ds)
    ds.flush()

    # The appended length must survive a reopen from storage.
    check_shapes(Dataset(url))
Пример #10
0
def test_dataset_2():
    """meta_information is writable after creation and reads back unchanged."""
    schema = {"first": "float", "second": "float"}
    ds = Dataset(
        schema=schema, shape=(2,), url="./data/test/test_dataset2", mode="w"
    )
    ds.meta_information["description"] = "This is my description"

    ds["first"][0] = 2.3
    # The meta entry survives an unrelated data write...
    assert ds.meta_information["description"] == "This is my description"
    # ...and the write to "first" does not leak into "second".
    assert ds["second"][0].numpy() != 2.3
Пример #11
0
def time_random_access(dataset_name="activeloop/mnist",
                       offset=1000,
                       span=1000,
                       field="image"):
    """Time reading `span` rows of `field` starting at `offset`, with caches off."""
    # Disable both layers of caching so the read timing reflects storage access.
    dset = Dataset(dataset_name, cache=False, storage_cache=False)
    with Timer(
            f"{dataset_name} read at offset {offset:03} of length {span:03}"):
        dset[field][offset:offset + span].compute()
Пример #12
0
def test_datasetview_slicing():
    """Integer indexing drops the sample axis; slice indexing keeps it."""
    schema = {"first": Tensor((100, 100))}
    ds = Dataset(schema=schema, shape=(20,), url="./data/test/model", mode="w")

    # Key-first access.
    assert ds["first", 0].numpy().shape == (100, 100)
    assert ds["first", 0:1].numpy().shape == (1, 100, 100)
    # Row-first access must agree with key-first access.
    assert ds[0]["first"].numpy().shape == (100, 100)
    assert ds[0:1]["first"].numpy().shape == (1, 100, 100)
Пример #13
0
def test_append_resize():
    """append_shape adds rows; resize_shape sets the absolute row count."""
    schema = {"first": Tensor(shape=(250, 300)), "second": "float"}
    ds = Dataset(
        schema=schema,
        shape=(100,),
        url="./data/test/append_resize",
        mode="a",
    )
    ds.append_shape(20)
    assert len(ds) == 120
    ds.resize_shape(150)
    assert len(ds) == 150
Пример #14
0
def test_dataset2():
    """Writing one float column must not affect the other."""
    schema = {"first": "float", "second": "float"}
    ds = Dataset(
        schema=schema, shape=(2, ), url="./data/test/test_dataset2", mode="w"
    )

    ds["first"][0] = 2.3
    assert ds["second"][0].numpy() != 2.3
Пример #15
0
def test_tensorview_slicing():
    """Slices of a dynamic tensor report per-sample shapes via .shape[0]."""
    dt = {"first": Tensor(shape=(None, None), max_shape=(250, 300))}
    ds = Dataset(schema=dt, shape=(20,), url="./data/test/tensorivew_slicing", mode="w")
    tv = ds["first", 5:6, 7:10, 9:10]
    # Round-trip the lazy flag; compute() below should behave the same afterwards.
    tv.disable_lazy()
    tv.enable_lazy()
    assert tv.compute()[0].shape == tuple(tv.shape[0]) == (3, 1)
    # A bare integer on the last axis drops that dimension from the sample shape.
    tv2 = ds["first", 5:6, 7:10, 9]
    assert tv2.numpy()[0].shape == tuple(tv2.shape[0]) == (3,)
Пример #16
0
def test_class_label_2():
    """ClassLabel round-trips for scalar, variable-length, and fixed-length labels."""
    cl1 = ClassLabel(names=["apple", "banana", "cat"])
    cl2 = ClassLabel((None, ), (10, ), names=["apple", "banana", "cat"])
    cl3 = ClassLabel((3, ), names=["apple", "banana", "cat"])
    my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3}

    # Bug fix: shape=(10) is just the int 10, not a 1-tuple; every other
    # Dataset call in this file passes a tuple, so use (10,) here too.
    ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10,), mode="w")

    # Scalar labels, written singly and via a slice.
    ds["cl1", 0] = cl1.str2int("cat")
    ds["cl1", 1] = cl1.str2int("apple")
    ds["cl1", 2] = cl1.str2int("apple")
    ds["cl1", 3:5] = [cl1.str2int("banana"), cl1.str2int("banana")]
    assert ds["cl1", 1].compute(True) == "apple"
    assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"]
    assert ds["cl1", 3:5].compute(True) == ["banana", "banana"]

    # Variable-length label vectors (max length 10) of differing sizes per row.
    ds["cl2", 0] = np.array(
        [cl2.str2int("cat"),
         cl2.str2int("cat"),
         cl2.str2int("apple")])
    ds["cl2", 1] = np.array([cl2.str2int("apple"), cl2.str2int("banana")])
    ds["cl2", 2] = np.array([
        cl2.str2int("cat"),
        cl2.str2int("apple"),
        cl2.str2int("banana"),
        cl2.str2int("apple"),
        cl2.str2int("banana"),
    ])
    ds["cl2", 3] = np.array([cl2.str2int("cat")])
    assert ds["cl2", 0].compute(True) == ["cat", "cat", "apple"]
    assert ds["cl2", 1].compute(True) == ["apple", "banana"]
    assert ds["cl2", 2].compute(True) == [
        "cat", "apple", "banana", "apple", "banana"
    ]
    assert ds["cl2", 3].compute(True) == ["cat"]

    # Fixed-length (3,) label vectors.
    ds["cl3", 0] = np.array(
        [cl3.str2int("apple"),
         cl3.str2int("apple"),
         cl3.str2int("apple")])
    ds["cl3", 1] = np.array(
        [cl3.str2int("banana"),
         cl3.str2int("banana"),
         cl3.str2int("banana")])
    ds["cl3", 2] = np.array(
        [cl3.str2int("cat"),
         cl3.str2int("cat"),
         cl3.str2int("cat")])
    assert ds["cl3", 0].compute(True) == ["apple", "apple", "apple"]
    assert ds["cl3", 1].compute(True) == ["banana", "banana", "banana"]
    assert ds["cl3", 2].compute(True) == ["cat", "cat", "cat"]
    assert ds["cl3", 0:3].compute(True) == [
        ["apple", "apple", "apple"],
        ["banana", "banana", "banana"],
        ["cat", "cat", "cat"],
    ]
Пример #17
0
def test_tensorview_iter():
    """Iterating a single-element TensorView yields items equal to the stored value."""
    schema = {"abc": "int32"}
    ds = Dataset(
        schema=schema, shape=(20,), url="./data/test/tensorivew_slicing", mode="w"
    )
    for value in range(20):
        ds["abc", value] = value
    view = ds["abc", 3]
    for element in view:
        assert element.compute() == 3
Пример #18
0
def test_dataset_append_and_read():
    """Open in append mode, write, close, then reopen read-only and delete."""
    dt = {"first": "float", "second": "float"}
    ds = Dataset(
        schema=dt,
        shape=(2,),
        url="./data/test/test_dataset_append_and_read",
        mode="a",
    )

    ds["first"][0] = 2.3
    # The write to "first" must not affect "second".
    assert ds["second"][0].numpy() != 2.3
    ds.close()

    # Reopen the persisted dataset read-only, then clean it up.
    ds = Dataset(
        url="./data/test/test_dataset_append_and_read",
        mode="r",
    )
    ds.delete()
    ds.close()
def benchmark_iterate_hub_local_tensorflow_setup(
    dataset_name, dataset_split, batch_size, prefetch_factor
):
    """Materialize a TFDS dataset locally and build a batched, prefetching TF loader.

    Returns a 1-tuple holding the loader, ready for iteration benchmarks.
    """
    source = Dataset.from_tfds(dataset_name, split=dataset_split)
    local_path = os.path.join(".", "hub_data", "tfds")
    source.store(local_path)
    # Re-open read-only with both cache layers disabled (cache / storage_cache).
    reopened = Dataset(local_path, cache=False, storage_cache=False, mode="r")

    loader = reopened.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)

    return (loader,)
Пример #20
0
def test_text_dataset():
    """A Text field stores and returns a long string verbatim."""
    schema = {
        "names": Text(shape=(None, ), max_shape=(1000, ), dtype="int64"),
    }
    ds = Dataset(
        "./data/test/testing_text", mode="w", schema=schema, shape=(10, )
    )
    sample = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    ds["names", 4] = sample
    assert ds["names", 4].numpy() == sample
Пример #21
0
def test_dataset_filter_4():
    """filter() keeps only matching rows, and slices of the view read correctly."""
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset("./data/tests/filtering_4", shape=(100,), schema=schema, mode="w")
    for idx in range(100):
        # First 10 rows are class 0, the remaining 90 are class 1.
        ds["cl", idx] = 0 if idx < 10 else 1
        ds["img", idx] = idx * np.ones((5, 6, 3))
    zeros_view = ds.filter(lambda sample: sample["cl"].compute() == 0)
    # The filtered view has 10 rows; any slice of "cl" is all zeros.
    assert (zeros_view[3:8, "cl"].compute() == np.zeros((5,))).all()
Пример #22
0
def test_datasetview_repr():
    """A DatasetView's repr embeds the full repr of its backing Dataset."""
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_repr"
    ds = Dataset(schema=dt, shape=(9,), url=url, mode="w", lazy=False)
    dsv = ds[2:]
    # NOTE: the expected repr shows the parent dataset's shape (9,), not the view's.
    print_text = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))}), url='./data/test/dsv_repr', shape=(9,), mode='w'))"
    assert dsv.__repr__() == print_text
Пример #23
0
def benchmark():
    """Iterate a remote dataset through a PyTorch DataLoader, timed via tqdm."""
    # schema/arr feed the one-time dataset creation kept below for reference.
    schema = {"image": Tensor((256, 256, 3), dtype="uint8")}
    arr = (np.random.rand(256, 256, 3) * 100).astype("uint8")
    # ds = Dataset("s3://snark-test/superficial_dataset", mode="w", schema=schema, shape=(5000,))
    # for i in tqdm(range(len(ds))):
    #     ds["image", i] = arr
    # ds.close()
    ds = Dataset("s3://snark-test/superficial_dataset")
    tds = ds.to_pytorch()
    dl = torch.utils.data.DataLoader(tds, batch_size=32, num_workers=16)
    # Fix: the enumerate index and batch variables were unused; just drain the loader.
    for _ in tqdm(dl):
        pass
Пример #24
0
def test_sharded_dataset_advanced_slice():
    """Slicing, negative indexing, and assignment on a ShardedDatasetView.

    Shard layout (15 rows total), mapping global index -> source row:
      shard 0: ds[3:5] -> rows 3, 4   (global 0-1)
      shard 1: ds      -> rows 0..9   (global 2-11)
      shard 2: ds[1]   -> row 1       (global 12)
      shard 3: ds[8:]  -> rows 8, 9   (global 13-14)
    """
    schema = {"first": "float", "second": "float"}
    ds = Dataset("./data/test_sharded_ds", shape=(10,), schema=schema, mode="w")
    # Each source row i holds first=i, second=2*i+1.
    for i in range(10):
        ds[i, "first"] = i
        ds[i, "second"] = 2 * i + 1

    dsv = ds[3:5]
    dsv2 = ds[1]
    dsv3 = ds[8:]
    datasets = [dsv, ds, dsv2, dsv3]
    sharded_ds = ShardedDatasetView(datasets)
    # A full-range slice concatenates the shards in order.
    assert sharded_ds["first", :].compute().tolist() == [
        3,
        4,
        0,
        1,
        2,
        3,
        4,
        5,
        6,
        7,
        8,
        9,
        1,
        8,
        9,
    ]
    # Omitting the slice is equivalent to [:].
    assert sharded_ds["first"].compute().tolist() == [
        3,
        4,
        0,
        1,
        2,
        3,
        4,
        5,
        6,
        7,
        8,
        9,
        1,
        8,
        9,
    ]
    # Negative slicing counts from the end of the concatenated view.
    assert sharded_ds["first", -4:].compute().tolist() == [9, 1, 8, 9]
    # Row slices return dicts of column values (globals 1 and 2 here).
    assert sharded_ds[1:3].compute()[0] == {"first": 4.0, "second": 9.0}
    assert sharded_ds[1:3].compute()[1] == {"first": 0.0, "second": 1.0}
    # Writes through the sharded view land in the correct underlying shards.
    sharded_ds["first", 1:5] = [10, 11, 12, 13]
    assert sharded_ds["first", 1:5].compute().tolist() == [10, 11, 12, 13]
    sharded_ds["first", 12] = 50
    assert sharded_ds["first", 12].compute() == 50
Пример #25
0
def time_iter_hub_wasabi_tensorflow(
    dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None
):
    """Time one full pass over a remote (Wasabi-hosted) dataset via to_tensorflow().

    dataset_info: mapping providing the dataset's "hub_name".
    process: optional callback invoked per batch with (image, label).
    """
    # Both cache layers disabled so timing reflects remote reads.
    dset = Dataset(dataset_info["hub_name"], cache=False, storage_cache=False, mode="r")
    loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)

    with Timer("Hub (remote - Wasabi) `.to_tensorflow()`"):
        for batch in loader:
            image = batch["image"]
            label = batch["label"]
            if process is not None:
                process(image, label)
Пример #26
0
def test_dataset_filter():
    """filter() returns a view whose indexes are exactly the matching rows."""

    def starts_with_abc(sample):
        return sample["ab"].compute().startswith("abc")

    my_schema = {"img": Tensor((100, 100)), "ab": Text((None,), max_shape=(10,))}
    ds = Dataset("./data/new_filter", shape=(10,), schema=my_schema)
    for idx in range(10):
        ds["img", idx] = idx * np.ones((100, 100))
        # Even rows get an "abc" prefix, odd rows "def".
        prefix = "abc" if idx % 2 == 0 else "def"
        ds["ab", idx] = prefix + str(idx)

    filtered = ds.filter(starts_with_abc)
    assert filtered.indexes == [0, 2, 4, 6, 8]
Пример #27
0
def test_dataset_copy_azure_local():
    """Copy an Azure-hosted dataset to local storage, then back to Azure; verify."""
    token = {"account_key": os.getenv("ACCOUNT_KEY")}
    ds = Dataset(
        "https://activeloop.blob.core.windows.net/activeloop-hub/cp_original_test_ds_azure_1",
        token=token,
        shape=(100,),
        schema=simple_schema,
    )
    DS2_PATH = "./data/testing/cp_copy_ds_local_4"
    DS3_PATH = "https://activeloop.blob.core.windows.net/activeloop-hub/cp_copy_test_ds_azure_2"
    for i in range(100):
        ds["num", i] = 2 * i
    # If a previous run left the target behind, delete it and retry the copy.
    # Bug fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # narrowed to Exception.
    try:
        ds2 = ds.copy(DS2_PATH)
    except Exception:
        dsi = Dataset(DS2_PATH)
        dsi.delete()
        ds2 = ds.copy(DS2_PATH)

    try:
        ds3 = ds2.copy(
            DS3_PATH,
            token=token,
        )
    except Exception:
        dsi = Dataset(
            DS3_PATH,
            token=token,
        )
        dsi.delete()
        ds3 = ds2.copy(
            DS3_PATH,
            token=token,
        )
    # Both copies must contain the same data as the source.
    for i in range(100):
        assert ds2["num", i].compute() == 2 * i
        assert ds3["num", i].compute() == 2 * i
    ds.delete()
    ds2.delete()
    ds3.delete()
Пример #28
0
def test_datasetview_get_dictionary():
    """Indexing a DatasetView at a dict-schema key returns nested computable entries."""
    # NOTE(review): relies on a module-level `my_schema` with a nested "label"
    # dict containing keys "a" and "d"/"e" — defined elsewhere in this file.
    ds = Dataset(
        schema=my_schema,
        shape=(20, ),
        url="./data/test/datasetview_get_dictionary",
        mode="w",
    )
    ds["label", 5, "a"] = 5 * np.ones((100, 200))
    ds["label", 5, "d", "e"] = 3 * np.ones((5, 3))
    dsv = ds[2:10]
    # View index 3 maps back to absolute row 5 (the view starts at row 2).
    dic = dsv[3, "label"]
    assert (dic["a"].compute() == 5 * np.ones((100, 200))).all()
    assert (dic["d"]["e"].compute() == 3 * np.ones((5, 3))).all()
Пример #29
0
def test_dataset_with_chunks():
    """Scalar and region writes on a chunked schema round-trip unchanged."""
    # NOTE(review): relies on a module-level `my_schema_with_chunks` with
    # "label/a" and a 5-D "image" field — defined elsewhere in this file.
    ds = Dataset(
        "./data/test/dataset_with_chunks",
        token=None,
        shape=(10000, ),
        mode="w",
        schema=my_schema_with_chunks,
    )
    # Single scalar element deep inside a nested field.
    ds["label/a", 5, 50, 50] = 8
    assert ds["label/a", 5, 50, 50].numpy() == 8
    # A (100, 150, 3) region write inside one image.
    ds["image", 5, 4, 100:200, 150:300, :] = np.ones((100, 150, 3), "uint8")
    assert (ds["image", 5, 4, 100:200, 150:300, :].numpy() == np.ones(
        (100, 150, 3), "uint8")).all()
Пример #30
0
def test_text_dataset():
    """Text fields round-trip via tuple and chained indexing, on datasets and views."""
    schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    ds = Dataset("./data/test/testing_text", mode="w", schema=schema, shape=(10,))
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    # Tuple-style write/read.
    ds["names", 4] = text + "4"
    assert ds["names", 4].numpy() == text + "4"
    # Chained-index write/read must behave the same.
    ds["names"][5] = text + "5"
    assert ds["names"][5].numpy() == text + "5"
    # Same two access styles through a DatasetView (view row 0 == absolute row 7).
    dsv = ds[7:9]
    dsv["names", 0] = text + "7"
    assert dsv["names", 0].numpy() == text + "7"
    dsv["names"][1] = text + "8"
    assert dsv["names"][1].numpy() == text + "8"

    # Fixed-shape Text accepts batched string assignment via a row slice.
    schema2 = {
        "id": Text(shape=(4,), dtype="int64"),
    }
    ds2 = Dataset("./data/test/testing_text_2", mode="w", schema=schema2, shape=(10,))
    ds2[0:5, "id"] = ["abcd", "efgh", "ijkl", "mnop", "qrst"]
    assert ds2[2:4, "id"].compute() == ["ijkl", "mnop"]