Example #1
def test_text_dataset_tokenizer():
    schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    ds = Dataset(
        "./data/test/testing_text", mode="w", schema=schema, shape=(10,), tokenizer=True
    )
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    ds["names", 4] = text + " 4"
    assert ds["names", 4].numpy() == text + " 4"
    ds["names"][5] = text + " 5"
    assert ds["names"][5].numpy() == text + " 5"
    dsv = ds[7:9]
    dsv["names", 0] = text + " 7"
    assert dsv["names", 0].numpy() == text + " 7"
    dsv["names"][1] = text + " 8"
    assert dsv["names"][1].numpy() == text + " 8"

    schema2 = {
        "id": Text(shape=(4,), dtype="int64"),
    }
    ds2 = Dataset(
        "./data/test/testing_text_2",
        mode="w",
        schema=schema2,
        shape=(10,),
        tokenizer=True,
    )
    ds2[0:5, "id"] = ["abcd", "abcd", "abcd", "abcd", "abcd"]
    assert ds2[2:4, "id"].compute() == ["abcd", "abcd"]
Example #2
def dict_to_hub(dic, path=""):
    # Recursively convert a (possibly nested) dict of values into a hub
    # SchemaDict; "/" is hub's path separator, so keys are sanitized first.
    d = {}
    for k, v in dic.items():
        k = k.replace("/", "_")
        cur_path = path + "/" + k
        if isinstance(v, dict):
            d[k] = dict_to_hub(v, path=cur_path)
        else:
            value_shape = v.shape if hasattr(v, "shape") else ()
            if isinstance(v, torch.Tensor):
                v = v.numpy()
            # Every axis is dynamic; max_dict (populated elsewhere) supplies
            # the per-path max shape, with 10000 per axis as the fallback.
            shape = tuple(None for it in value_shape)
            max_shape = (
                max_dict[cur_path] or tuple(10000 for it in value_shape)
                if not isinstance(v, str)
                else (10000,)
            )
            dtype = v.dtype.name if hasattr(v, "dtype") else type(v)
            dtype = "int64" if isinstance(v, str) else dtype
            d[k] = (
                Tensor(shape=shape, dtype=dtype, max_shape=max_shape)
                if not isinstance(v, str)
                else Text(shape=(None,), dtype=dtype, max_shape=max_shape)
            )
    return SchemaDict(d)
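
A minimal usage sketch (hypothetical sample dict; assumes the torch import and hub.schema names used above, plus a max_dict populated elsewhere with per-path maximum shapes):

import numpy as np

max_dict = {"/meta/score": (100,)}  # hypothetical per-path max shapes
sample = {"meta": {"score": np.zeros(7), "name": "hello"}}
schema = dict_to_hub(sample)
# 'score' becomes Tensor(shape=(None,), dtype='float64', max_shape=(100,)),
# 'name' becomes Text(shape=(None,), dtype='int64', max_shape=(10000,)).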
Example #3
def test_meta_information():
    description = {"author": "testing", "description": "here goes the testing text"}

    description_changed = {
        "author": "changed author",
        "description": "now it's changed",
    }

    schema = {"text": Text((None,), max_shape=(1000,))}

    ds = Dataset(
        "./data/test_meta",
        shape=(10,),
        schema=schema,
        meta_information=description,
        mode="w",
    )

    some_text = ["hello world", "hello penguin", "hi penguin"]

    for i, text in enumerate(some_text):
        ds["text", i] = text

    assert type(ds.meta["meta_info"]) == dict
    assert ds.meta["meta_info"]["author"] == "testing"
    assert ds.meta["meta_info"]["description"] == "here goes the testing text"

    ds.close()
Example #4
def test_threaded():
    init_schema = {
        "image":
        Tensor(shape=(None, None, None),
               max_shape=(4, 224, 224),
               dtype="float32")
    }
    schema = {
        "image":
        Tensor(shape=(None, None, None),
               max_shape=(4, 224, 224),
               dtype="float32"),
        "label":
        Tensor(shape=(None, ), max_shape=(6, ), dtype="uint8"),
        "text_label":
        Text((None, ), "int64", (14, )),
        "flight_code":
        Text((None, ), "int64", (10, )),
    }

    ds_init = hub.Dataset(
        "./data/hub/new_pipeline_threaded2",
        mode="w",
        shape=(10, ),
        schema=init_schema,
        cache=False,
    )

    for i in range(len(ds_init)):
        ds_init["image", i] = np.ones((4, 220, 224))
        ds_init["image", i] = np.ones((4, 221, 224))

    @hub.transform(schema=schema, scheduler="threaded", workers=2)
    def create_classification_dataset(sample):
        ts = sample["image"]
        return [{
            "image": ts,
            "label": np.ones((6, )),
            "text_label": "PLANTED",
            "flight_code": "UYKNTHNXR",
        } for _ in range(5)]

    ds = create_classification_dataset(ds_init).store(
        "./data/hub/new_pipeline_threaded_final")

    assert ds["image", 0].shape[1] == 221
Example #5
def test_text():
    my_schema = {"text": Text((None, ), max_shape=(10, ))}

    @hub.transform(schema=my_schema)
    def my_transform(sample):
        return {"text": np.array("abc")}

    ds = my_transform([i for i in range(10)])
    ds2 = ds.store("./data/test/transform_text")
    for i in range(10):
        assert ds2["text", i].compute() == "abc"
Example #6
def tensor_to_hub(tf_dt, max_shape=None):
    if tf_dt.dtype.name == "string":
        # Strings become dynamically sized int64 Text tensors, capped at
        # max_shape (100000 by default).
        max_shape = max_shape or (100000, )
        return Text(shape=(None, ), dtype="int64", max_shape=max_shape)
    dt = tf_dt.dtype.name
    # Trim an over-long max_shape from the left so its rank matches the
    # tensor's shape.
    if max_shape and len(max_shape) > len(tf_dt.shape):
        max_shape = max_shape[(len(max_shape) - len(tf_dt.shape)):]

    max_shape = max_shape or tuple(10000 if dim is None else dim
                                   for dim in tf_dt.shape)
    return Tensor(shape=tf_dt.shape, dtype=dt, max_shape=max_shape)
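
A short sketch of what the converter yields (assumes TensorFlow is installed; tf.TensorSpec exposes the .dtype.name and .shape attributes the function inspects):

import tensorflow as tf

spec = tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32)
t = tensor_to_hub(spec)
# -> Tensor(shape=(None, 224, 224, 3), dtype='float32',
#           max_shape=(10000, 224, 224, 3))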
Example #7
def test_datasetview_repr():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_repr"
    ds = Dataset(schema=dt, shape=(9,), url=url, mode="w", lazy=False)
    dsv = ds[2:]
    print_text = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))}), url='./data/test/dsv_repr', shape=(9,), mode='w'))"
    assert dsv.__repr__() == print_text
Example #8
def test_text_dataset():
    schema = {
        "names": Text(shape=(None, ), max_shape=(1000, ), dtype="int64"),
    }
    ds = Dataset("./data/test/testing_text",
                 mode="w",
                 schema=schema,
                 shape=(10, ))
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    ds["names", 4] = text
    assert ds["names", 4].numpy() == text
Example #9
def test_dataset_filter():
    def abc_filter(sample):
        return sample["ab"].compute().startswith("abc")

    my_schema = {"img": Tensor((100, 100)), "ab": Text((None,), max_shape=(10,))}
    ds = Dataset("./data/new_filter", shape=(10,), schema=my_schema)
    for i in range(10):
        ds["img", i] = i * np.ones((100, 100))
        ds["ab", i] = "abc" + str(i) if i % 2 == 0 else "def" + str(i)

    ds2 = ds.filter(abc_filter)
    assert ds2.indexes == [0, 2, 4, 6, 8]
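
As a quick follow-up sketch (same iteration pattern as the filtering example further down), every sample surviving the filter satisfies the predicate:

for item in ds2:
    assert item.compute()["ab"].startswith("abc")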
Example #10
def test_dataset_lazy():
    dt = {
        "first": Tensor(shape=(2, )),
        "second": "float",
        "text": Text(shape=(None, ), max_shape=(12, )),
    }
    url = "./data/test/ds_lazy"
    ds = Dataset(schema=dt, shape=(2, ), url=url, mode="w", lazy=False)
    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])
    assert ds["text", 1] == "hello world"
    assert ds["second", 0] == 3.14
    assert (ds["first", 0] == np.array([5, 6])).all()
Example #11
def test_dataset_setting_shape():
    schema = {"text": Text(shape=(None,), dtype="int64", max_shape=(10,))}

    url = "./data/test/text_data"
    ds = Dataset(schema=schema, shape=(5,), url=url, mode="w")
    slice_ = slice(0, 5, None)
    key = "text"
    batch = [
        np.array("THTMLY2F9"),
        np.array("QUUVEU2IU"),
        np.array("8ZUFCYWKD"),
        "H9EDFAGHB",
        "WDLDYN6XG",
    ]
    shape = ds._tensors[f"/{key}"].get_shape_from_value([slice_], batch)
    assert shape[0][0] == [1]
Example #12
def test_datasetview_2():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    ds = Dataset("./data/test/dsv_2/", schema=dt, shape=(9,), mode="w")
    dsv = ds[2:]
    with pytest.raises(ValueError):
        dsv[3] = np.ones((3, 5))

    with pytest.raises(KeyError):
        dsv["abc"] = np.ones((3, 5))
    dsv["second"] = np.array([0, 1, 2, 3, 4, 5, 6])
    for i in range(7):
        assert dsv[i, "second"].compute() == i
Example #13
def dict_to_hub(d):
    # Build the schema into a fresh dict: renaming keys ("/" -> "_") while
    # iterating d.items() would otherwise mutate d mid-iteration.
    result = {}
    for k, v in d.items():
        k = k.replace("/", "_")
        if isinstance(v, dict):
            result[k] = dict_to_hub(v)
        else:
            value_shape = v.shape if hasattr(v, "shape") else ()
            shape = tuple([None for it in value_shape])
            max_shape = tuple([10000 for it in value_shape])
            if isinstance(v, torch.Tensor):
                v = v.numpy()
            dtype = v.dtype.name if hasattr(v, "dtype") else type(v)
            dtype = "int64" if isinstance(v, str) else dtype
            result[k] = (
                Tensor(shape=shape, dtype=dtype, max_shape=max_shape)
                if not isinstance(v, str) else Text(
                    shape=(None, ), dtype=dtype, max_shape=(10000, )))
    return SchemaDict(result)
Example #14
def test_dataset_view_lazy():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_lazy"
    ds = Dataset(schema=dt, shape=(4,), url=url, mode="w")
    ds["text", 3] = "hello world"
    ds["second", 2] = 3.14
    ds["first", 2] = np.array([5, 6])
    dsv = ds[2:]
    dsv.disable_lazy()
    assert dsv["text", 1] == "hello world"
    assert dsv["second", 0] == 3.14
    assert (dsv["first", 0] == np.array([5, 6])).all()
    dsv.enable_lazy()
    assert dsv["text", 1].compute() == "hello world"
    assert dsv["second", 0].compute() == 3.14
    assert (dsv["first", 0].compute() == np.array([5, 6])).all()
Example #15
def test_dataset_compute():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/ds_compute"
    ds = Dataset(schema=dt, shape=(2,), url=url, mode="w")
    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])
    comp = ds.compute()
    comp0 = comp[0]
    assert (comp0["first"] == np.array([5, 6])).all()
    assert comp0["second"] == 3.14
    assert comp0["text"] == ""
    comp1 = comp[1]
    assert (comp1["first"] == np.array([0, 0])).all()
    assert comp1["second"] == 0
    assert comp1["text"] == "hello world"
Example #16
def test_dataset_assign_value():
    schema = {"text": Text(shape=(None,), dtype="int64", max_shape=(10,))}
    url = "./data/test/text_data"
    ds = Dataset(schema=schema, shape=(7,), url=url, mode="w")
    slice_ = slice(0, 5, None)
    key = "text"
    batch = [
        np.array("THTMLY2F9"),
        np.array("QUUVEU2IU"),
        np.array("8ZUFCYWKD"),
        "H9EDFAGHB",
        "WDLDYN6XG",
    ]
    ds[key, slice_] = batch
    ds[key][5] = np.array("GHLSGBFF8")
    ds[key][6] = "YGFJN75NF"
    assert ds["text", 0].compute() == "THTMLY2F9"
    assert ds["text", 1].compute() == "QUUVEU2IU"
    assert ds["text", 2].compute() == "8ZUFCYWKD"
    assert ds["text", 3].compute() == "H9EDFAGHB"
    assert ds["text", 4].compute() == "WDLDYN6XG"
    assert ds["text", 5].compute() == "GHLSGBFF8"
    assert ds["text", 6].compute() == "YGFJN75NF"
Example #17
def text_to_hub(tf_dt, max_shape=None):
    max_shape = max_shape or (100000,)
    dt = "int64"
    return Text(shape=(None,), dtype=dt, max_shape=max_shape)
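
A minimal sketch of the resulting schema objects (tf_dt is never inspected here, so a placeholder argument works; max_shape is the only knob):

t = text_to_hub(None)
# -> Text(shape=(None,), dtype='int64', max_shape=(100000,))
t_short = text_to_hub(None, max_shape=(500,))  # caps the encoded length at 500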
Example #18
        )

        sentences = list(df.sentence.values)
        labels = list(df.label.values)
        data = list(zip(sentences, labels))

        @transform(schema=self.schema)
        def load_transform(sample):
            return {"sentence": sample[0], "labels": sample[1]}

        ds = load_transform(data)
        return ds.store(self.tag)


def main(url, tag, schema):
    R = Retrieve(url, tag, schema)
    R.fetch()
    R.unpack()
    R.push()


if __name__ == "__main__":
    url = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip"
    tag = "activeloop/CoLA"
    schema = {
        "sentence": Text(shape=(None, ), max_shape=(500, )),
        "labels": Primitive(dtype="int64"),
    }

    main(url, tag, schema)
Example #19
def test_dataset_filter_2():
    my_schema = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
    }
    ds = Dataset("./data/tests/filtering", shape=(100,), schema=my_schema, mode="w")
    for i in range(100):
        ds["fname", i] = "John"
        ds["lname", i] = "Doe"

    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"

    for i in [15, 31, 25, 75, 3, 6]:
        ds["lname", i] = "loop"

    dsv_combined = ds.filter(
        lambda x: x["fname"].compute() == "Active" and x["lname"].compute() == "loop"
    )
    tsv_combined_fname = dsv_combined["fname"]
    tsv_combined_lname = dsv_combined["lname"]
    for item in dsv_combined:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    for item in tsv_combined_fname:
        assert item.compute() == "Active"
    for item in tsv_combined_lname:
        assert item.compute() == "loop"
    dsv_1 = ds.filter(lambda x: x["fname"].compute() == "Active")
    dsv_2 = dsv_1.filter(lambda x: x["lname"].compute() == "loop")
    for item in dsv_1:
        assert item.compute()["fname"] == "Active"
    tsv_1 = dsv_1["fname"]
    tsv_2 = dsv_2["lname"]
    for item in tsv_1:
        assert item.compute() == "Active"
    for item in tsv_2:
        assert item.compute() == "loop"
    for item in dsv_2:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_combined.indexes == [3, 6, 15, 75]
    assert dsv_1.indexes == [1, 3, 6, 15, 63, 75, 96]
    assert dsv_2.indexes == [3, 6, 15, 75]

    dsv_3 = ds.filter(lambda x: x["lname"].compute() == "loop")
    dsv_4 = dsv_3.filter(lambda x: x["fname"].compute() == "Active")
    for item in dsv_3:
        assert item.compute()["lname"] == "loop"
    for item in dsv_4:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_3.indexes == [3, 6, 15, 25, 31, 75]
    assert dsv_4.indexes == [3, 6, 15, 75]

    my_schema2 = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
        "image": Image((1920, 1080, 3)),
    }
    ds = Dataset("./data/tests/filtering2", shape=(100,), schema=my_schema2, mode="w")
    with pytest.raises(KeyError):
        ds.filter(lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all())

    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"
    dsv = ds.filter(lambda x: x["fname"].compute() == "Active")
    with pytest.raises(KeyError):
        dsv.filter(lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all())
Example #20
of our dataset. We have different types of schemas for different
types of data, like image, tensor, and text. More info in the docs.
"""
mpii_schema = {
    """
    we specify 'shape' as None for variable image size, and we
    give 'max_shape' arguement a maximum possible size of image.
    """
    "image":
    schema.Image(shape=(None, None, 3),
                 max_shape=(1920, 1920, 3),
                 dtype="uint8"),
    "isValidation":
    "float64",
    "img_paths":
    Text(shape=(None, ), max_shape=(15, )),
    "img_width":
    "int32",
    "img_height":
    "int32",
    "objpos":
    Tensor(max_shape=(100, ), dtype="float64"),
    """
    'joint_self' has nested list structure
    """
    "joint_self":
    Tensor(shape=(None, None), max_shape=(100, 100), dtype="float64"),
    "scale_provided":
    "float64",
    "annolist_index":
    "int32",
Example #21
import numpy as np
import zarr

import hub
from hub.schema import Tensor, Image, Text
from hub.utils import Timer

my_schema = {
    "image": Tensor((28, 28, 4), "int32", (28, 28, 4)),
    "label": Text((None, ), "int64", (20, )),
    "confidence": "float",
}

dynamic_schema = {
    "image": Tensor(shape=(None, None, None),
                    dtype="int32",
                    max_shape=(32, 32, 3)),
    "label": Text((None, ), "int64", (20, )),
}


def test_pipeline_basic():
    ds = hub.Dataset("./data/test/test_pipeline_basic",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)

    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2