Example #1
def write_hub(arr, path, overwrite=True):
    """Write a hub dataset to disk
    """
    # remove any existing dataset at `path` if overwriting is allowed
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)

    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    if arr.ndim == 1:
        # a 1D array is stored as a single sample of length arr.shape[0]
        schema = {"value": hub.schema.Tensor(arr.shape[0])}
        dataset = hub.Dataset(path, shape=(1, ), schema=schema, mode='w')
        dataset["value", 0][:] = arr.astype(np.float32)
        dataset.flush()
        dataset.close()

    elif arr.ndim == 2:
        # a 2D array is stored one sample per row
        schema = {"value": hub.schema.Tensor(arr.shape[1])}
        dataset = hub.Dataset(path,
                              shape=(arr.shape[0], ),
                              schema=schema,
                              mode='w')
        dataset["value"][:] = arr.astype(np.float32)
        dataset.flush()
        dataset.close()
    else:
        raise ValueError("hub backend only supports 1D or 2D arrays")
Example #2
def get_dataset_from_hub(samples=1, read_from_fs=False, pytorch=False):
    """
    Build dataset and transform to pytorch or tensorflow
    """
    my_schema = {"img": Tensor(shape=(3, 256, 256)), "label": "uint8"}
    if not read_from_fs:
        ds = hub.Dataset(
            "kristina/benchmarking",
            shape=(samples, ),
            schema=my_schema,
            cache=False,
        )
    else:
        ds = hub.Dataset(
            "s3://snark-test/benchmarking",
            shape=(samples, ),
            schema=my_schema,
            cache=False,
        )
    for i in range(samples):
        ds["img", i] = np.random.rand(3, 256, 256)
        ds["label", i] = 0
    ds_hub = ds.to_pytorch() if pytorch else ds.to_tensorflow()
    ds = MyDataset(ds_hub)  # MyDataset is a project-specific wrapper defined elsewhere
    return ds
Example #3
def create_large_dataset():
    sample_count = 60  # change this to a big number to test

    # Define the schema of the dataset
    schema = {
        "image":
        Tensor((1920, 1080, 3), chunks=(2, 1920, 1080, 3), dtype="float64")
    }
    array = np.random.random((10, 1920, 1080, 3))

    # Write the dataset
    ds = hub.Dataset(
        "./data/examples/large_dataset_build",
        shape=(sample_count, ),
        schema=schema,
    )

    for i in range(len(ds) // 10):
        ds["image", i * 10:i * 10 + 10] = i * array
    ds.commit()

    ds = hub.Dataset("./data/examples/large_dataset_build")
    print(ds.keys, ds["image"].shape, ds["image"].dtype)

    # Read the dataset
    with hub.Dataset("./data/examples/large_dataset_build") as ds:
        for i in range(len(ds) // 10):
            assert (ds["image", i * 10, 0, 0, 0].compute() /
                    array[0, 0, 0, 0]) == i
Example #4
File: infer.py  Project: kevinlu1211/Hub
def infer_dataset(path, scheduler="single", workers=1):
    # TODO: handle s3 path

    if not os.path.isdir(path):
        raise Exception("input path must be either a directory")

    hub_path = os.path.join("./", path, "hub")

    if os.path.isdir(hub_path):
        print('inferred dataset found in "%s", using that' % hub_path)
        return hub.Dataset(hub_path, mode="r")

    root = _find_root(path)
    ds = None

    directory_parsers = state.get_parsers()
    if len(directory_parsers) <= 0:
        raise Exception("directory parsers list was empty.")

    # go through all functions created using the `directory_parser` decorator in
    # `hub.schema.auto.directory_parsers`
    for parser in directory_parsers:
        ds = parser(root, scheduler, workers)
        if ds is not None:
            break

    if ds is None:
        raise Exception(
            'could not infer dataset for the root "%s". either add a new parser to '
            % root +
            "`hub.schema.auto.directory_parsers` or write a custom transform + schema."
        )

    ds.store(hub_path)  # TODO: handle s3
    return hub.Dataset(hub_path, mode="r")
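The parser loop above only assumes that each registered parser is a callable taking (root, scheduler, workers) and returning a dataset or None. A hedged sketch of one such parser follows; the import path and decorator usage are assumptions based on the comment in the code, and the parser name is hypothetical:

from hub.schema.auto.directory_parsers import directory_parser  # assumed location

@directory_parser
def my_folder_parser(root, scheduler, workers):
    # return a hub dataset if `root` matches this layout, otherwise None
    if not os.path.isdir(os.path.join(root, "images")):
        return None
    ...  # build and return the dataset via a custom transform + schema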
Example #5
def test_read_mode():
    my_schema = {"abc": "uint8"}
    ds = hub.Dataset("./data/test_versioning/read_ds",
                     schema=my_schema,
                     shape=(10, ))
    ds.checkout("second", create=True)
    ds2 = hub.Dataset("./data/test_versioning/read_ds", mode="r")
    with pytest.raises(ReadModeException):
        ds2.commit("first")
    with pytest.raises(ReadModeException):
        ds2.checkout("third", create=True)
    with pytest.raises(ReadModeException):
        ds2["abc", 4] = 10
Example #6
def test_stacked_transform():
    schema = {"test": Tensor((2, 2), dtype="uint8")}

    @hub.transform(schema=schema)
    def multiply_transform(sample, multiplier=1, times=1):
        if times == 1:
            return {"test": multiplier * sample["test"]}
        else:
            return [{
                "test": multiplier * sample["test"]
            } for i in range(times)]

    @hub.transform(schema=schema)
    def multiply_transform_2(sample, multiplier=1, times=1):
        if times == 1:
            return {"test": multiplier * sample["test"]}
        else:
            return [{
                "test": multiplier * sample["test"]
            } for i in range(times)]

    ds = hub.Dataset("./data/stacked_transform",
                     mode="w",
                     shape=(5, ),
                     schema=schema)
    for i in range(5):
        ds["test", i] = np.ones((2, 2))
    ds1 = multiply_transform(ds, multiplier=2, times=5)
    ds2 = multiply_transform(ds1, multiplier=3, times=2)
    ds3 = multiply_transform_2(ds2, multiplier=5, times=3)
    ds4 = ds3.store("./data/stacked_transform_2")
    assert len(ds4) == 150
    assert (ds4["test", 0].compute() == 30 * np.ones((2, 2))).all()
Example #7
def time_tiledb(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    tiledb_path = dataset.split("/")[1] + "_tileDB"
    if os.path.exists(tiledb_path):
        ds_tldb = tiledb.open(tiledb_path)
    else:
        os.makedirs(tiledb_path)
        # flatten image and label tensors into one 2D array: one row per sample
        ds_numpy = np.concatenate(
            (
                ds["image"].compute().reshape(ds.shape[0], -1),
                ds["label"].compute().reshape(ds.shape[0], -1),
            ),
            axis=1,
        )
        ds_tldb = tiledb.from_numpy(tiledb_path, ds_numpy)

    assert type(ds_tldb) == tiledb.array.DenseArray

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, :-1],
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
Example #8
def test_pipeline_basic():
    ds = hub.Dataset("./data/test/test_pipeline_basic",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)

    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2

    @hub.transform(schema=my_schema)
    def my_transform(sample, multiplier: int = 2):
        return {
            "image": sample["image"] * multiplier,
            "label": sample["label"],
            "confidence": sample["confidence"] * multiplier,
        }

    out_ds = my_transform(ds, multiplier=2)
    assert (out_ds["image", 0].compute() == 2).all()
    assert len(list(out_ds)) == 100
    res_ds = out_ds.store("./data/test/test_pipeline_basic_output")
    assert res_ds["label", 5].compute() == "hello 5"
    assert (res_ds["image", 4].compute() == 2 * np.ones(
        (28, 28, 4), dtype="int32")).all()
    assert len(res_ds) == len(out_ds)
    assert res_ds.shape[0] == out_ds.shape[0]
    assert "image" in res_ds.schema.dict_ and "label" in res_ds.schema.dict_
Example #9
def test_pipeline_ray():
    ds = hub.Dataset(
        "./data/test/test_pipeline_basic",
        mode="w",
        shape=(100, ),
        schema=my_schema,
        cache=False,
    )

    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence/confidence", i] = 0.2

    @hub.transform(schema=my_schema, scheduler="ray")
    def my_transform(sample, multiplier: int = 2):
        return {
            "image": sample["image"] * multiplier,
            "label": sample["label"],
            "confidence": {
                "confidence": sample["confidence"]["confidence"] * multiplier
            },
        }

    out_ds = my_transform(ds, multiplier=2)
    assert (out_ds["image", 0].compute() == 2).all()
    assert len(list(out_ds)) == 100
    out_ds.store("./data/test/test_pipeline_basic_output")
Example #10
def test_checkout_address_not_found():
    my_schema = {"abc": "uint8"}
    ds = hub.Dataset("./data/test_versioning/ds_address",
                     schema=my_schema,
                     shape=(10, ))
    with pytest.raises(AddressNotFound):
        ds.checkout("second")
Example #11
def test_pipeline():

    ds = hub.Dataset("./data/test/test_pipeline_multiple2",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)

    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2

    with Timer("multiple pipes"):

        @hub.transform(schema=my_schema)
        def my_transform(sample, multiplier: int = 2):
            return {
                "image": sample["image"] * multiplier,
                "label": sample["label"],
                "confidence": sample["confidence"] * multiplier,
            }

        out_ds = my_transform(ds, multiplier=2)
        out_ds = my_transform(out_ds, multiplier=2)
        out_ds = out_ds.store("./data/test/test_pipeline_multiple_4")

        assert (out_ds["image", 0].compute() == 4).all()
Example #12
def example_to_pytorch():
    ds = hub.Dataset("activeloop/fashion_mnist_train")
    torch_ds = ds.to_pytorch(output_type=list)
    torch_dataloader = torch.utils.data.DataLoader(
        torch_ds,
        batch_size=8,
    )
    return torch_dataloader
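A short usage sketch for the loader above; because to_pytorch was called with output_type=list, each batch is assumed to arrive as a list of collated field tensors rather than a dict:

torch_dataloader = example_to_pytorch()
for batch in torch_dataloader:
    # one entry per schema field, each batched along the first dimension
    print([t.shape for t in batch])
    break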
Example #13
    def __iter__(self):

        # open the dataset lazily on first iteration
        if self.dataset is None:
            self.dataset = hub.Dataset(self.path, self.storage)

        for x in self._enumerate(self.dataset):
            x = self.transform(x)
            yield tuple(x)
Example #14
def test_dataset_with_objects():
    schema = {"images": Tensor(shape=(10,), dtype="object", chunks=(5,))}

    ds = hub.Dataset(
        "./data/test/test_dataset_with_objects", mode="w", shape=(100,), schema=schema
    )
    ds["images", 6, 5] = np.ones((20, 30, 4), dtype="uint8")
    ds.close()
Example #15
def test_old_datasets():
    ds = hub.Dataset("activeloop/mnist")
    with pytest.raises(VersioningNotSupportedException):
        ds.checkout("third")
    with pytest.raises(VersioningNotSupportedException):
        ds.checkout("third", create=True)
    with pytest.raises(VersioningNotSupportedException):
        ds.log()
Example #16
def generate_dataset(shape=(10,), size=(1024, 1024), chunksize=None):
    """
    Generate a dataset of random tensors.
    """
    my_schema = {"img": Tensor(shape=size, chunks=chunksize)}
    ds = hub.Dataset("kristina/benchmarking", shape=shape, schema=my_schema)
    for i in range(shape[0]):
        ds["img", i] = np.random.rand(*size)
    return ds
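A usage sketch for the generator above, with the default arguments (note it writes to the same "kristina/benchmarking" path as Example #2):

ds = generate_dataset(shape=(10,), size=(1024, 1024))
print(ds["img", 0].compute().shape)  # expected: (1024, 1024)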
Example #17
def get_hub_dataset():
    schema = hub.schema.SchemaDict({
        'text':
        hub.schema.Tensor(shape=(None, ), dtype='int64', max_shape=(2049, ))
    })
    ds = hub.Dataset("snsi/pile_train0", schema=schema,
                     shape=(100000, )).to_pytorch()
    # ds = hub.Dataset("interneuron/pile_train0", shape=(None,)).to_pytorch()
    return HubAdapter(ds)
Example #18
def test_commit_checkout():
    my_schema = {"img": hub.schema.Tensor((1000, 1000, 3))}
    ds = hub.Dataset("./data/eg_1", shape=(10, ), schema=my_schema, mode="w")

    for i in range(10):
        ds["img", i] = np.ones((1000, 1000, 3))

    first_commit_id = ds.commit("stored all ones")

    for i in range(5):
        ds["img", i] = ds["img", i].compute() * 2

    second_commit_id = ds.commit("multiplied value of some images by 2")

    assert (ds["img", 4].compute() == 2 * np.ones((1000, 1000, 3))).all()

    ds.checkout(first_commit_id)  # now all images are ones again

    for i in range(10):
        assert (ds["img", i].compute() == np.ones((1000, 1000, 3))).all()

    ds.checkout(
        "alternate", create=True
    )  # creating a new branch as we are currently not on the head of master

    for i in range(5):
        ds["img", i] = ds["img", i].compute() * 3

    # if we had not checked out to the "alternate" branch earlier, this commit would auto-checkout to a new branch
    ds.commit("multiplied value of some images by 3")

    assert (ds["img", 4].compute() == 3 * np.ones((1000, 1000, 3))).all()

    ds.checkout(second_commit_id)  # first 5 images are 2s, rest are 1s now

    for i in range(5, 10):
        ds["img", i] = ds["img", i].compute() * 2

    # we are not at the head of master but at an older commit, so committing automatically checks us out to a new branch
    # this happens any time we commit while not at the head of a branch
    ds.commit("multiplied value of remaining images by 2")

    for i in range(10):
        assert (ds["img", i].compute() == 2 * np.ones((1000, 1000, 3))).all()

    ds.checkout("alternate")

    for i in range(5, 10):
        ds["img", i] = ds["img", i].compute() * 3

    for i in range(10):
        assert (ds["img", i].compute() == 3 * np.ones((1000, 1000, 3))).all()

    # we are already at the head of alternate, so no new branch is created; the commit lands on alternate itself
    ds.commit("multiplied value of remaining images by 3")
Example #19
def test_hub_open():
    ds = hub.Dataset("./data/test/hub_open",
                     token=None,
                     shape=(10000, ),
                     mode="w",
                     schema=schema)
    ds["label/a", 5, 50, 50] = 9
    assert ds["label/a", 5, 50, 50].numpy() == 9
    ds["image", 5, 4, 120:200, 150:300, :] = 3 * np.ones((80, 150, 3), "uint8")
    assert (ds["image", 5, 4, 120:200, 150:300, :].numpy() == 3 * np.ones(
        (80, 150, 3), "uint8")).all()
Example #20
def test_auto_checkout_bug():
    my_schema = {"abc": "uint8"}
    ds = hub.Dataset("./data/test_versioning/branch_bug",
                     shape=(10, ),
                     schema=my_schema,
                     mode="w")
    ds["abc", 0] = 1
    a = ds.commit("it is 1")
    ds["abc", 0] = 2
    b = ds.commit("it is 2")
    c = ds.checkout(a)
    d = ds.checkout("other", True)
    ds["abc", 0] = 3
    e = ds.commit("it is 3")
    ds.checkout(b)
    ds["abc", 0] = 4
    f = ds.commit("it is 4")
    g = ds.checkout(a)
    dsv = ds[0:3]
    dsv["abc", 0] = 5
    h = ds.commit("it is 5")
    i = ds.checkout(e)
    tsv = ds[0:5, "abc"]
    tsv[0] = 6
    j = ds.commit("it is 6")
    ds.log()
    ds.checkout(a)
    assert dsv["abc", 0].compute() == 1
    assert ds["abc", 0].compute() == 1
    ds.checkout(b)
    assert ds["abc", 0].compute() == 2
    ds.checkout(c)
    assert ds["abc", 0].compute() == 1
    ds.checkout(d)
    assert ds["abc", 0].compute() == 3
    ds.checkout(e)
    assert ds["abc", 0].compute() == 3
    ds.checkout(f)
    assert ds["abc", 0].compute() == 4
    ds.checkout(g)
    assert ds["abc", 0].compute() == 1
    ds.checkout(h)
    assert ds["abc", 0].compute() == 5
    ds.checkout(i)
    assert ds["abc", 0].compute() == 3
    ds.checkout(j)
    assert ds["abc", 0].compute() == 6
    ds.checkout("master")
    assert ds["abc", 0].compute() == 2
    ds["abc", 0] = 7
    ds.checkout("copy", True)
    assert ds["abc", 0].compute() == 7
    ds.checkout("other")
    assert ds["abc", 0].compute() == 3
Example #21
def time_hub(dataset, batch_size=1, num_batches=1, local=True, user=None):
    my_schema = {
        "image": hub.schema.Image(shape=(28, 28, 1), dtype="uint8"),
        "label": hub.schema.ClassLabel(num_classes=10),
    }
    if local is True:
        ds = hub.Dataset(
            "./" + dataset + "_hub",
            shape=(batch_size * num_batches, ),
            schema=my_schema,
            mode="w",
        )
    else:
        ds = hub.Dataset(
            user + "/" + dataset,
            shape=(batch_size * num_batches, ),
            schema=my_schema,
            mode="w",
        )

    assert type(ds) == hub.api.dataset.Dataset
    time_batches(ds, batch_size, num_batches, hub=True)
Example #22
def bench_hub_compression(times=REPEAT_TIMES):
    arr = np.array(IMG)
    ds = hub.Dataset(
        "./data/bench_png_compression",
        mode="w",
        shape=times,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )

    batch = np.zeros((times, ) + arr.shape, dtype="uint8")
    for i in range(times):
        batch[i] = arr

    with Timer("Hub compression"):
        ds["image", :times] = batch
Example #23
def benchmark_compress_hub_setup(
        times, image_path="./images/compression_benchmark_image.png"):
    img = Image.open(image_path)
    arr = np.array(img)
    ds = hub.Dataset(
        "./data/bench_png_compression",
        mode="w",
        shape=times,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )

    batch = np.zeros((times, ) + arr.shape, dtype="uint8")
    for i in range(times):
        batch[i] = arr

    return (ds, times, batch)
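Example #23 only prepares the data; a run step in the spirit of Example #22, reusing the Timer helper from the other snippets (the function name is hypothetical), might look like:

def benchmark_compress_hub_run(params):
    ds, times, batch = params  # tuple returned by the setup above
    with Timer("Hub compression"):
        ds["image", :times] = batch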
Example #24
def bench_hub_compression(img_path=img_path, count=count):
    img = Image.open(img_path)
    arr = np.array(img)
    print(arr.shape)
    ds = hub.Dataset(
        "./data/benchmarks/bench_png_compression",
        mode="w",
        shape=count,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )
    print(ds._tensors["/image"].chunks)
    bigarr = np.zeros((count, ) + arr.shape, dtype="uint8")
    for i in range(count):
        bigarr[i] = arr

    with Timer("Hub compression"):
        ds["image", :count] = bigarr
Example #25
def test_threaded():
    init_schema = {
        "image":
        Tensor(shape=(None, None, None),
               max_shape=(4, 224, 224),
               dtype="float32")
    }
    schema = {
        "image":
        Tensor(shape=(None, None, None),
               max_shape=(4, 224, 224),
               dtype="float32"),
        "label":
        Tensor(shape=(None, ), max_shape=(6, ), dtype="uint8"),
        "text_label":
        Text((None, ), "int64", (14, )),
        "flight_code":
        Text((None, ), "int64", (10, )),
    }

    ds_init = hub.Dataset(
        "./data/hub/new_pipeline_threaded2",
        mode="w",
        shape=(10, ),
        schema=init_schema,
        cache=False,
    )

    for i in range(len(ds_init)):
        ds_init["image", i] = np.ones((4, 220, 224))
        # the second write overwrites the first; the stored sample shape becomes (4, 221, 224)
        ds_init["image", i] = np.ones((4, 221, 224))

    @hub.transform(schema=schema, scheduler="threaded", workers=2)
    def create_classification_dataset(sample):
        ts = sample["image"]
        return [{
            "image": ts,
            "label": np.ones((6, )),
            "text_label": "PLANTED",
            "flight_code": "UYKNTHNXR",
        } for _ in range(5)]

    ds = create_classification_dataset(ds_init).store(
        "./data/hub/new_pipeline_threaded_final")

    assert ds["image", 0].shape[1] == 221
Example #26
def time_hub(dataset, batch_size=1):
    ds = hub.Dataset(dataset, cache=False, storage_cache=False, mode="r")

    assert type(ds) == hub.api.dataset.Dataset

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds[batch * batch_size : (batch + 1) * batch_size]["image"].compute(),
                ds[batch * batch_size : (batch + 1) * batch_size]["label"].compute(),
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
Example #27
def test_commit():
    my_schema = {"abc": "uint32"}
    ds = hub.Dataset("./data/test_versioning/eg_1",
                     shape=(10, ),
                     schema=my_schema,
                     mode="w")
    ds["abc", 0] = 1
    a = ds.commit("first")
    ds["abc", 0] = 2
    b = ds.commit("second")
    ds["abc", 0] = 3
    c = ds.commit("third")
    assert ds["abc", 0].compute() == 3
    ds.checkout(a)
    assert ds["abc", 0].compute() == 1
    ds.checkout(b)
    assert ds["abc", 0].compute() == 2
    ds.checkout(c)
    assert ds["abc", 0].compute() == 3
Example #28
def test_commit_checkout_2():
    my_schema = {
        "abc": "uint32",
        "img": Image((1000, 1000, 3), dtype="uint16"),
    }
    ds = hub.Dataset("./data/test_versioning/eg_3",
                     shape=(100, ),
                     schema=my_schema,
                     mode="w")
    for i in range(100):
        ds["img", i] = i * np.ones((1000, 1000, 3))
    a = ds.commit("first")

    # chunk 7.0.0.0 gets rewritten
    ds["img", 21] = 2 * ds["img", 21].compute()

    # the rest of the chunk stays intact
    assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
    assert (ds["img", 22].compute() == 22 * np.ones((1000, 1000, 3))).all()
    assert (ds["img", 23].compute() == 23 * np.ones((1000, 1000, 3))).all()

    # other chunks are still read from the original commit; e.g. chunk 11, which holds the 35th sample, has a single copy
    assert (ds["img", 35].compute() == 35 * np.ones((1000, 1000, 3))).all()

    b = ds.commit("second")

    # going back to first commit
    ds.checkout(a)

    # sanity check
    assert (ds["img", 21].compute() == 21 * np.ones((1000, 1000, 3))).all()

    ds.checkout("another", create=True)

    ds["img", 21] = 3 * ds["img", 21].compute()
    assert (ds["img", 21].compute() == 3 * 21 * np.ones((1000, 1000, 3))).all(
    )  # and not 6 * 21 as it would have been, had we checked out from b

    ds.commit("first2")

    ds.checkout("master")
    assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
    ds.log()
Example #29
File: eurosat.py  Project: stjordanis/Hub-1
def main():
    ds = hub.Dataset("eurosat/eurosat-rgb")

    # 26000 samples in dataset, accessing values
    print(ds["image"][10].numpy())
    print(
        ds["label", 15].numpy()
    )  # alternate way to access, by specifying both key and sample number at once
    print(ds["filename", 20:22].numpy())  # accessing multiple elements at once

    # Splitting into train and test sets
    train_ds = ds[:13000]
    test_ds = ds[13000:]

    # Using hub with tensorflow
    train_tf_ds = train_ds.to_tensorflow().batch(2)

    for batch in train_tf_ds:
        print(batch["label"], batch["filename"], batch["image"])
        break

    test_tf_ds = test_ds.to_tensorflow().batch(2)

    for batch in test_tf_ds:
        print(batch["label"], batch["filename"], batch["image"])
        break

    # Using hub with pytorch
    train_pt_ds = train_ds.to_pytorch()
    train_loader = torch.utils.data.DataLoader(train_pt_ds, batch_size=2)

    for batch in train_loader:
        print(batch["label"], batch["image"]
              )  # pytorch tensors don't support text labels such as filename
        break

    test_pt_ds = test_ds.to_pytorch()
    test_loader = torch.utils.data.DataLoader(test_pt_ds, batch_size=2)
    for batch in test_loader:
        print(batch["label"], batch["image"]
              )  # pytorch tensors don't support text labels such as filename
        break
Example #30
def main():
    sample_count = 70000
    step = 10
    with Timer("Time"):

        ds = hub.Dataset(
            "./data/examples/mnist_upload_speed_benchmark",
            mode="w",
            schema=schema,
            shape=(sample_count, ),
            cache=2**26,
        )

        arr = (np.random.rand(step, 28, 28) * 100).astype("uint8")

        for i in range(0, sample_count, step):
            # with Timer(f"Sample {i}"):
            ds["image", i:i + step] = arr

        ds.commit()
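A quick read-back check in the style of Example #26, assuming the dataset written above:

ds = hub.Dataset("./data/examples/mnist_upload_speed_benchmark", mode="r")
print(ds["image", 0:10].compute().shape)  # expected: (10, 28, 28)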