예제 #1
0
def test_check_label_name():
    """Verify that `compute(label_name=True)` maps class ids to their names.

    Covers the whole dataset, a single sample and a slice, each with and
    without label-name translation.
    """
    class_names = ["red", "green", "blue"]
    my_schema = {"label": ClassLabel(names=list(class_names))}
    ds = Dataset("./data/test/dataset2", shape=(5,), mode="w", schema=my_schema)

    # Samples 0 and 1 are written twice with the same values before sample 2.
    for _ in range(2):
        ds["label", 0] = 1
        ds["label", 1] = 2
    ds["label", 2] = 0

    # Samples 3 and 4 are never written; the assertions expect them to read
    # back as class id 0 ("red").
    expected_ids = [1, 2, 0, 0, 0]
    rows_as_ids = [{"label": class_id} for class_id in expected_ids]
    rows_as_names = [{"label": class_names[class_id]} for class_id in expected_ids]

    # Whole dataset.
    assert ds.compute(label_name=True).tolist() == rows_as_names
    assert ds.compute().tolist() == rows_as_ids

    # Single sample.
    assert ds[1].compute(label_name=True) == {"label": "blue"}
    assert ds[1].compute() == {"label": 2}

    # Slice of samples.
    assert ds[1:3].compute(label_name=True).tolist() == rows_as_names[1:3]
    assert ds[1:3].compute().tolist() == rows_as_ids[1:3]
예제 #2
0
def test_serialize_deserialize():
    """Check that a nested schema survives a serialize/deserialize round trip.

    The flattened paths, shapes and dtype strings of the restored schema must
    equal those of the original schema.
    """

    def flat_summary(schema):
        # Materialise the flattened leaves once, then project the three
        # attributes the round trip is compared on.
        leaves = tuple(schema._flatten())
        return (
            [leaf.path for leaf in leaves],
            [leaf.shape for leaf in leaves],
            [str(leaf.dtype) for leaf in leaves],
        )

    # Build the field mapping entry by entry, in the same key order the
    # schema is meant to have.
    fields = {}
    fields["image"] = Image(shape=(300, 400, 3), dtype="uint8")
    fields["label"] = Tensor(
        shape=(5000, ),
        dtype={
            "first": {
                "a": "<U20",
                "b": "uint32",
                "c": ClassLabel(num_classes=3),
            },
            "second": "float64",
        },
    )
    fields["bbox"] = BBox(dtype="float64")
    fields["audio"] = Audio(shape=(120, ), dtype="uint32")
    fields["mask"] = Mask(shape=(5, 8, 1))
    fields["polygon"] = Polygon(shape=(16, 2))
    fields["segmentation1"] = Segmentation(
        shape=(5, 9, 1), dtype="uint8", num_classes=5
    )
    fields["segmentation2"] = Segmentation(
        shape=(5, 9, 1),
        dtype="uint8",
        names=("apple", "orange", "pineapple"),
    )
    fields["sequence"] = Sequence(
        dtype=Tensor(shape=(None, None), max_shape=(100, 100), dtype="uint8"),
    )
    fields["text"] = Text((None, ), max_shape=(10, ))
    fields["video"] = Video((100, 100, 3, 10))

    t = Tensor(shape=(100, 200), dtype=fields)

    # Snapshot the original before serializing it.
    expected_paths, expected_shapes, expected_dtypes = flat_summary(t)

    restored = deserialize(serialize(t))
    paths, shapes, dtypes = flat_summary(restored)

    assert paths == expected_paths
    assert shapes == expected_shapes
    assert dtypes == expected_dtypes
예제 #3
0
    def __init__(
        self,
        shape: Tuple[int, ...] = None,
        dtype: str = None,
        num_classes: int = None,
        names: Tuple[str] = None,
        names_file: str = None,
        max_shape: Tuple[int, ...] = None,
        chunks=None,
        compressor="lz4",
    ):
        """Constructs a Segmentation HubSchema.
        Also constructs ClassLabel HubSchema for Segmentation classes.

        Parameters
        ----------
        shape: tuple of ints or None
            Shape in format (height, width, 1)
        dtype: str
            dtype of segmentation array: `uint16` or `uint8`
        num_classes: int
            Number of classes. All labels must be < num_classes.
        names: `list<str>`
            string names for the integer classes. The order in which the names are provided is kept.
        names_file: str
            Path to a file with names for the integer classes, one per line.
        max_shape : tuple[int]
            Maximum shape of tensor shape if tensor is dynamic
        chunks : tuple[int] | True
            Describes how to split tensor dimensions into chunks (files) to store them efficiently.
            It is anticipated that each file should be ~16MB.
            Sample Count is also in the list of tensor's dimensions (first dimension)
            If default value is chosen, automatically detects how to split into chunks
        compressor : str
            Compressor forwarded to the base schema constructor for the
            segmentation array. Defaults to "lz4".
        """
        # Forward `compressor` to the base constructor; previously the
        # argument was accepted but silently dropped, so a caller-supplied
        # value never took effect.
        super().__init__(
            shape,
            dtype,
            max_shape=max_shape,
            chunks=chunks,
            compressor=compressor,
        )
        # The companion class-label schema keeps its fixed "lz4" compressor.
        self.class_labels = ClassLabel(
            num_classes=num_classes,
            names=names,
            names_file=names_file,
            chunks=chunks,
            compressor="lz4",
        )
예제 #4
0
def test_dataset_filtering_3():
    """Filter on a class label, then slice the filtered view.

    The first ten samples are labelled 0 and the rest 1, so positions 3..7 of
    the `cl == 0` view must all be zero.
    """
    sample_count = 100
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset(
        "./test/filtering_3", shape=(sample_count, ), schema=schema, mode="w"
    )

    for idx in range(sample_count):
        ds["cl", idx] = 1 if idx >= 10 else 0
        ds["img", idx] = idx * np.ones((5, 6, 3))

    zeros_only = ds.filter({"cl": 0})
    window = zeros_only[3:8, "cl"].compute()
    assert (window == np.zeros((5, ))).all()
예제 #5
0
def test_classlabel_repr():
    """Pin the textual representation of ClassLabel for both construction modes."""
    cases = [
        (
            ClassLabel(num_classes=5),
            "ClassLabel(shape=(), dtype='int64', num_classes=5)",
        ),
        (
            ClassLabel(names=["apple", "orange", "banana"]),
            "ClassLabel(shape=(), dtype='int64', names=['apple', 'orange', 'banana'], num_classes=3)",
        ),
    ]
    for label_schema, expected_text in cases:
        assert repr(label_schema) == expected_text
예제 #6
0
def test_dataset_filtering_2():
    """Filter a dataset by class label and inspect the filtered views.

    Every fifth sample is labelled 0, the rest 1, and sample 4 is relabelled
    to 2 so that the `cl == 2` view contains exactly one sample.
    """
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    # NOTE(review): the path says "filtering_3" although this is test 2, so it
    # may share storage with another test - confirm whether that is intended.
    ds = Dataset("./test/filtering_3", shape=(100, ), schema=schema, mode="w")

    for idx in range(100):
        ds["cl", idx] = 1 if idx % 5 else 0
        ds["img", idx] = idx * np.ones((5, 6, 3))
    ds["cl", 4] = 2

    label_zero_view = ds.filter({"cl": 0})
    assert label_zero_view.indexes == list(range(0, 100, 5))
    # Computing "img" on this view is expected to raise ValueError; the reason
    # is not visible from this test.
    with pytest.raises(ValueError):
        label_zero_view["img"].compute()

    label_two_view = ds.filter({"cl": 2})
    batch_of_one = 4 * np.ones((1, 5, 6, 3))
    assert (label_two_view["img"].compute() == batch_of_one).all()
    for sample in label_two_view:
        assert (sample["img"].compute() == 4 * np.ones((5, 6, 3))).all()
        assert sample["cl"].compute() == 2
예제 #7
0
def test_class_label():
    """Exercise ClassLabel built from a class count, from names and from a names file."""
    numbered = ClassLabel(num_classes=4)
    named = ClassLabel(names=["alpha", "beta", "gamma"])
    # Construction from a names file only has to succeed; `names_file` comes
    # from the enclosing module.
    ClassLabel(names_file=names_file)

    # Names
    assert numbered.names == ["0", "1", "2", "3"]
    assert named.names == ["alpha", "beta", "gamma"]

    # String -> integer
    assert numbered.str2int("1") == 1
    assert named.str2int("gamma") == 2

    # Integer -> string
    # FIXME This is a bug, should raise an error
    assert numbered.int2str(2) is None
    assert named.int2str(0) == "alpha"

    # Class counts
    assert numbered.num_classes == 4
    assert named.num_classes == 3

    numbered.get_attr_dict()
예제 #8
0
def deserialize(inp):
    """Rebuild a schema object from its serialized representation.

    Non-dict inputs are returned unchanged. Dict inputs are dispatched on
    ``inp["type"]``, and nested ``dtype`` / ``class_labels`` / ``items``
    entries are deserialized recursively. A dict whose type tag matches none
    of the known schema kinds yields ``None``.
    """
    # Plain values (e.g. dtype strings) are already in their final form.
    if not isinstance(inp, dict):
        return inp

    kind = inp["type"]

    if kind == "Audio":
        return Audio(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            file_format=inp["file_format"],
            sample_rate=inp["sample_rate"],
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "BBox":
        return BBox(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
            max_shape=tuple(inp["max_shape"]),
        )

    if kind == "ClassLabel":
        # Explicit names win; otherwise fall back to the stored class count.
        if inp["_names"] is None:
            return ClassLabel(
                shape=tuple(inp["shape"]),
                dtype=deserialize(inp["dtype"]),
                num_classes=inp["_num_classes"],
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
                max_shape=tuple(inp["max_shape"]),
            )
        return ClassLabel(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            names=inp["_names"],
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
            max_shape=tuple(inp["max_shape"]),
        )

    if kind in ("SchemaDict", "FeatureDict"):
        members = {}
        for key, value in inp["items"].items():
            members[key] = deserialize(value)
        return SchemaDict(members)

    if kind == "Image":
        return Image(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Mask":
        return Mask(
            shape=tuple(inp["shape"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Polygon":
        return Polygon(
            shape=tuple(inp["shape"]),
            max_shape=tuple(inp["max_shape"]),
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Primitive":
        return Primitive(
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Segmentation":
        class_labels = deserialize(inp["class_labels"])
        # Same precedence as for ClassLabel: names first, class count otherwise.
        if class_labels._names is None:
            return Segmentation(
                shape=tuple(inp["shape"]),
                dtype=deserialize(inp["dtype"]),
                num_classes=class_labels._num_classes,
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        return Segmentation(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            names=class_labels._names,
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Sequence":
        return Sequence(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Tensor":
        return Tensor(
            tuple(inp["shape"]),
            deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Text":
        return Text(
            tuple(inp["shape"]),
            deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Video":
        return Video(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    # Unrecognised type tags are not an error here: the result is None.
    return None
예제 #9
0
def test_class_label():
    """Cover ClassLabel construction, name/index conversion and validation errors."""
    cl1 = ClassLabel(num_classes=5)
    cl2 = ClassLabel(names=["apple", "orange", "banana"])

    # Duplicate names are rejected, with or without an explicit class count.
    with pytest.raises(ValueError):
        ClassLabel(names=["apple", "orange", "banana", "apple"])
    with pytest.raises(ValueError):
        ClassLabel(names=["apple", "orange", "banana", "apple"], num_classes=2)

    # Construction without any arguments only has to succeed.
    ClassLabel()
    cl6 = ClassLabel(names_file="./hub/schema/tests/class_label_names.txt")

    # Names
    assert cl1.names == ["0", "1", "2", "3", "4"]
    assert cl2.names == ["apple", "orange", "banana"]
    assert cl6.names == ["alpha", "beta", "gamma"]

    # Class counts
    assert cl1.num_classes == 5
    assert cl2.num_classes == 3

    # Conversions in both directions
    assert cl1.str2int("3") == 3
    assert cl2.str2int("orange") == 1
    assert cl1.int2str(4) == "4"
    assert cl2.int2str(2) == "banana"

    # Unknown names
    with pytest.raises(KeyError):
        cl2.str2int("2")
    for bad_name in ("8", "abc"):
        with pytest.raises(ValueError):
            cl1.str2int(bad_name)

    # Reassigning names on an existing label is rejected.
    for label in (cl1, cl2):
        with pytest.raises(ValueError):
            label.names = ["ab", "cd", "ef", "gh"]
예제 #10
0
def test_class_label_2():
    """Round-trip class labels through a dataset for three label layouts.

    `cl1` is declared without a shape, `cl2` with shape (None,) and a second
    positional argument (10,), and `cl3` with shape (3,). Each is written as
    integer ids and read back as names via `compute(True)`.
    """
    cl1 = ClassLabel(names=["apple", "banana", "cat"])
    cl2 = ClassLabel((None, ), (10, ), names=["apple", "banana", "cat"])
    cl3 = ClassLabel((3, ), names=["apple", "banana", "cat"])
    my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3}

    def encode(label, words):
        # Translate names to ids one by one, in order, and pack them in an array.
        return np.array([label.str2int(word) for word in words])

    # NOTE: `(10)` is the plain integer 10, not a one-element tuple.
    ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10), mode="w")

    # --- cl1 ---
    ds["cl1", 0] = cl1.str2int("cat")
    ds["cl1", 1] = cl1.str2int("apple")
    ds["cl1", 2] = cl1.str2int("apple")
    ds["cl1", 3:5] = [cl1.str2int(word) for word in ("banana", "banana")]
    assert ds["cl1", 1].compute(True) == "apple"
    assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"]
    assert ds["cl1", 3:5].compute(True) == ["banana", "banana"]

    # --- cl2: rows of different lengths ---
    ds["cl2", 0] = encode(cl2, ["cat", "cat", "apple"])
    ds["cl2", 1] = encode(cl2, ["apple", "banana"])
    ds["cl2", 2] = encode(cl2, ["cat", "apple", "banana", "apple", "banana"])
    ds["cl2", 3] = encode(cl2, ["cat"])
    assert ds["cl2", 0].compute(True) == ["cat", "cat", "apple"]
    assert ds["cl2", 1].compute(True) == ["apple", "banana"]
    assert ds["cl2", 2].compute(True) == [
        "cat", "apple", "banana", "apple", "banana"
    ]
    assert ds["cl2", 3].compute(True) == ["cat"]

    # --- cl3: three labels per row ---
    ds["cl3", 0] = encode(cl3, ["apple", "apple", "apple"])
    ds["cl3", 1] = encode(cl3, ["banana", "banana", "banana"])
    ds["cl3", 2] = encode(cl3, ["cat", "cat", "cat"])
    assert ds["cl3", 0].compute(True) == ["apple", "apple", "apple"]
    assert ds["cl3", 1].compute(True) == ["banana", "banana", "banana"]
    assert ds["cl3", 2].compute(True) == ["cat", "cat", "cat"]
    assert ds["cl3", 0:3].compute(True) == [
        ["apple", "apple", "apple"],
        ["banana", "banana", "banana"],
        ["cat", "cat", "cat"],
    ]
예제 #11
0
def deserialize(inp):
    """Rebuild a schema object from its serialized representation.

    Non-dict inputs are returned unchanged. Dict inputs are dispatched on
    ``inp["type"]``, and nested ``dtype`` / ``class_labels`` / ``items``
    entries are deserialized recursively. A dict whose type tag matches none
    of the known schema kinds yields ``None``.
    """
    # Plain values (e.g. dtype strings) are already in their final form.
    if not isinstance(inp, dict):
        return inp

    kind = inp["type"]

    if kind == "Audio":
        return Audio(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            file_format=inp["file_format"],
            sample_rate=inp["sample_rate"],
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "BBox":
        return BBox(
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "ClassLabel":
        # A stored class count takes precedence; otherwise use the names.
        if "_num_classes" in inp:
            return ClassLabel(
                num_classes=inp["_num_classes"],
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        return ClassLabel(
            names=inp["names"],
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind in ("SchemaDict", "FeatureDict"):
        members = {}
        for key, value in inp["items"].items():
            members[key] = deserialize(value)
        return SchemaDict(members)

    if kind == "Image":
        return Image(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            # TODO uncomment back when image encoding will be added
            # encoding_format=inp["encoding_format"],
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Mask":
        return Mask(
            shape=tuple(inp["shape"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Polygon":
        return Polygon(
            shape=tuple(inp["shape"]),
            max_shape=tuple(inp["max_shape"]),
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Segmentation":
        class_labels = deserialize(inp["class_labels"])
        # Same precedence as for ClassLabel: class count first, names otherwise.
        if hasattr(class_labels, "_num_classes"):
            return Segmentation(
                shape=tuple(inp["shape"]),
                dtype=deserialize(inp["dtype"]),
                num_classes=class_labels._num_classes,
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        return Segmentation(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            names=class_labels.names,
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Sequence":
        return Sequence(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Tensor":
        return Tensor(
            tuple(inp["shape"]),
            deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Text":
        return Text(
            tuple(inp["shape"]),
            deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    if kind == "Video":
        return Video(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            # TODO uncomment back when image encoding will be added
            # encoding_format=inp["encoding_format"],
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )

    # Unrecognised type tags are not an error here: the result is None.
    return None
예제 #12
0
class Segmentation(Tensor):
    """`HubSchema` for segmentation"""
    def __init__(
        self,
        shape: Tuple[int, ...] = None,
        dtype: str = None,
        num_classes: int = None,
        names: Tuple[str] = None,
        names_file: str = None,
        max_shape: Tuple[int, ...] = None,
        chunks=None,
        compressor="lz4",
    ):
        """Constructs a Segmentation HubSchema.
        Also constructs ClassLabel HubSchema for Segmentation classes.

        Parameters
        ----------
        shape: tuple of ints or None
            Shape in format (height, width, 1)
        dtype: str
            dtype of segmentation array: `uint16` or `uint8`
        num_classes: int
            Number of classes. All labels must be < num_classes.
        names: `list<str>`
            string names for the integer classes. The order in which the names are provided is kept.
        names_file: str
            Path to a file with names for the integer classes, one per line.
        max_shape : tuple[int]
            Maximum shape of tensor shape if tensor is dynamic
        chunks : tuple[int] | True
            Describes how to split tensor dimensions into chunks (files) to store them efficiently.
            It is anticipated that each file should be ~16MB.
            Sample Count is also in the list of tensor's dimensions (first dimension)
            If default value is chosen, automatically detects how to split into chunks
        compressor : str
            Compressor forwarded to the `Tensor` constructor for the
            segmentation array. Defaults to "lz4".
        """
        # Forward `compressor` to the base constructor; previously the
        # argument was accepted but silently dropped, so a caller-supplied
        # value never took effect.
        super().__init__(
            shape,
            dtype,
            max_shape=max_shape,
            chunks=chunks,
            compressor=compressor,
        )
        # The companion class-label schema keeps its fixed "lz4" compressor.
        self.class_labels = ClassLabel(
            num_classes=num_classes,
            names=names,
            names_file=names_file,
            chunks=chunks,
            compressor="lz4",
        )

    def get_segmentation_classes(self):
        """Get classes of the segmentation mask"""
        # NOTE(review): np.unique is applied to the schema object itself, not
        # to mask data - confirm this is the intended input.
        class_indices = np.unique(self)
        return [self.class_labels.int2str(value) for value in class_indices]

    def get_attr_dict(self):
        """Return class attributes."""
        return self.__dict__

    def __str__(self):
        """Return the base class text relabelled as `Segmentation`, with the
        class names and/or class count appended when they are set."""
        out = super().__str__()
        # Drop the first six characters and the final one, then prepend the
        # subclass name; presumably the base text looks like "Tensor(...)".
        out = "Segmentation" + out[6:-1]
        if self.class_labels._names is not None:
            out += ", names=" + str(self.class_labels._names)
        if self.class_labels._num_classes is not None:
            out += ", num_classes=" + str(self.class_labels._num_classes)
        return out + ")"

    def __repr__(self):
        """Same text as `__str__`."""
        return self.__str__()