def test_check_label_name():
    """compute(label_name=True) maps stored integer labels to class names."""
    schema = {"label": ClassLabel(names=["red", "green", "blue"])}
    dataset = Dataset("./data/test/dataset2", shape=(5,), mode="w", schema=schema)
    # The first two writes are repeated in the original scenario; kept as-is.
    dataset["label", 0] = 1
    dataset["label", 1] = 2
    dataset["label", 0] = 1
    dataset["label", 1] = 2
    dataset["label", 2] = 0
    expected_names = [
        {"label": "green"},
        {"label": "blue"},
        {"label": "red"},
        {"label": "red"},
        {"label": "red"},
    ]
    assert dataset.compute(label_name=True).tolist() == expected_names
    expected_ints = [
        {"label": 1},
        {"label": 2},
        {"label": 0},
        {"label": 0},
        {"label": 0},
    ]
    assert dataset.compute().tolist() == expected_ints
    # Single-item and slice views honor label_name the same way.
    assert dataset[1].compute(label_name=True) == {"label": "blue"}
    assert dataset[1].compute() == {"label": 2}
    assert dataset[1:3].compute(label_name=True).tolist() == [
        {"label": "blue"},
        {"label": "red"},
    ]
    assert dataset[1:3].compute().tolist() == [{"label": 2}, {"label": 0}]
def test_serialize_deserialize():
    """Round-trip a deeply nested schema through serialize()/deserialize().

    Paths, shapes, and dtypes of every flattened leaf must survive the
    round trip unchanged. Fixes the misspelled local variable
    ``origanal_dtypes`` -> ``original_dtypes``.
    """
    t = Tensor(
        shape=(100, 200),
        dtype={
            "image": Image(shape=(300, 400, 3), dtype="uint8"),
            "label": Tensor(
                shape=(5000,),
                dtype={
                    "first": {
                        "a": "<U20",
                        "b": "uint32",
                        "c": ClassLabel(num_classes=3),
                    },
                    "second": "float64",
                },
            ),
            "bbox": BBox(dtype="float64"),
            "audio": Audio(shape=(120,), dtype="uint32"),
            "mask": Mask(shape=(5, 8, 1)),
            "polygon": Polygon(shape=(16, 2)),
            "segmentation1": Segmentation(
                shape=(5, 9, 1), dtype="uint8", num_classes=5
            ),
            "segmentation2": Segmentation(
                shape=(5, 9, 1), dtype="uint8", names=("apple", "orange", "pineapple")
            ),
            "sequence": Sequence(
                dtype=Tensor(shape=(None, None), max_shape=(100, 100), dtype="uint8"),
            ),
            "text": Text((None,), max_shape=(10,)),
            "video": Video((100, 100, 3, 10)),
        },
    )
    original_result = tuple(t._flatten())
    original_paths = [r.path for r in original_result]
    original_shapes = [r.shape for r in original_result]
    original_dtypes = [str(r.dtype) for r in original_result]

    serialize_t = serialize(t)
    deserialize_t = deserialize(serialize_t)
    result = tuple(deserialize_t._flatten())
    paths = [r.path for r in result]
    shapes = [r.shape for r in result]
    dtypes = [str(r.dtype) for r in result]

    assert paths == original_paths
    assert shapes == original_shapes
    assert dtypes == original_dtypes
def __init__(
    self,
    shape: Tuple[int, ...] = None,
    dtype: str = None,
    num_classes: int = None,
    names: Tuple[str] = None,
    names_file: str = None,
    max_shape: Tuple[int, ...] = None,
    chunks=None,
    compressor="lz4",
):
    """Constructs a Segmentation HubSchema.

    Also constructs a ClassLabel HubSchema for the Segmentation classes.

    Parameters
    ----------
    shape: tuple of ints or None
        Shape in format (height, width, 1)
    dtype: str
        dtype of segmentation array: `uint16` or `uint8`
    num_classes: int
        Number of classes. All labels must be < num_classes.
    names: `list<str>`
        string names for the integer classes. The order in which the names
        are provided is kept.
    names_file: str
        Path to a file with names for the integer classes, one per line.
    max_shape : tuple[int]
        Maximum shape of tensor shape if tensor is dynamic
    chunks : tuple[int] | True
        Describes how to split tensor dimensions into chunks (files) to
        store them efficiently. It is anticipated that each file should be
        ~16MB. Sample Count is also in the list of tensor's dimensions
        (first dimension). If default value is chosen, automatically
        detects how to split into chunks.
    compressor : str
        Compressor used for the class-label data; defaults to "lz4".
    """
    # NOTE(review): `compressor` is not forwarded to the Tensor base class
    # here — confirm whether Tensor.__init__ should receive it as well.
    super().__init__(shape, dtype, max_shape=max_shape, chunks=chunks)
    self.class_labels = ClassLabel(
        num_classes=num_classes,
        names=names,
        names_file=names_file,
        chunks=chunks,
        # Bug fix: honor the caller-supplied compressor instead of the
        # hard-coded "lz4", which silently ignored the parameter.
        compressor=compressor,
    )
def test_dataset_filtering_3():
    """Filter a dataset by class label, then slice the filtered view."""
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    dataset = Dataset("./test/filtering_3", shape=(100,), schema=schema, mode="w")
    for index in range(100):
        # First ten samples belong to class 0, the remaining ninety to class 1.
        dataset["cl", index] = 0 if index < 10 else 1
        dataset["img", index] = index * np.ones((5, 6, 3))
    filtered = dataset.filter({"cl": 0})
    # Samples 3..7 of the filtered view must all carry label 0.
    assert (filtered[3:8, "cl"].compute() == np.zeros((5,))).all()
def test_classlabel_repr():
    """repr() of ClassLabel built with num_classes vs. explicit names."""
    unnamed = ClassLabel(num_classes=5)
    named = ClassLabel(names=["apple", "orange", "banana"])
    expected_unnamed = "ClassLabel(shape=(), dtype='int64', num_classes=5)"
    expected_named = (
        "ClassLabel(shape=(), dtype='int64', "
        "names=['apple', 'orange', 'banana'], num_classes=3)"
    )
    assert repr(unnamed) == expected_unnamed
    assert repr(named) == expected_named
def test_dataset_filtering_2():
    """Filtering with sparse matches and iteration over a filtered view.

    Fix: the original wrote to "./test/filtering_3", the same directory
    used by test_dataset_filtering_3 — a copy-paste collision that makes
    the two tests clobber each other's on-disk state. Each test now gets
    its own directory.
    """
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset("./test/filtering_2", shape=(100,), schema=schema, mode="w")
    for i in range(100):
        # Every fifth sample is class 0, the rest class 1.
        ds["cl", i] = 0 if i % 5 == 0 else 1
        ds["img", i] = i * np.ones((5, 6, 3))
    # Overwrite sample 4 so exactly one sample carries class 2.
    ds["cl", 4] = 2
    ds_filtered = ds.filter({"cl": 0})
    assert ds_filtered.indexes == [5 * i for i in range(20)]
    with pytest.raises(ValueError):
        ds_filtered["img"].compute()
    ds_filtered_2 = ds.filter({"cl": 2})
    assert (ds_filtered_2["img"].compute() == 4 * np.ones((1, 5, 6, 3))).all()
    for item in ds_filtered_2:
        assert (item["img"].compute() == 4 * np.ones((5, 6, 3))).all()
        assert item["cl"].compute() == 2
def test_class_label():
    """ClassLabel construction plus str2int/int2str conversions."""
    by_count = ClassLabel(num_classes=4)
    by_names = ClassLabel(names=["alpha", "beta", "gamma"])
    # names_file is a module-level fixture path; constructing must not raise.
    ClassLabel(names_file=names_file)
    assert by_count.names == ["0", "1", "2", "3"]
    assert by_names.names == ["alpha", "beta", "gamma"]
    assert by_count.str2int("1") == 1
    assert by_names.str2int("gamma") == 2
    # FIXME This is a bug, should raise an error
    assert by_count.int2str(2) is None
    assert by_names.int2str(0) == "alpha"
    assert by_count.num_classes == 4
    assert by_names.num_classes == 3
    by_count.get_attr_dict()
def deserialize(inp):
    """Rebuild a schema object from its serialized dict representation.

    Values that are not dicts (e.g. primitive dtype strings) pass through
    unchanged. A dict whose "type" is unrecognized yields None, matching
    the original implicit fall-through.
    """
    if not isinstance(inp, dict):
        return inp
    kind = inp["type"]
    if kind == "Audio":
        return Audio(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            file_format=inp["file_format"],
            sample_rate=inp["sample_rate"],
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "BBox":
        return BBox(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
            max_shape=tuple(inp["max_shape"]),
        )
    if kind == "ClassLabel":
        # Kwargs shared by both ClassLabel construction paths.
        common = dict(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
            max_shape=tuple(inp["max_shape"]),
        )
        if inp["_names"] is not None:
            return ClassLabel(names=inp["_names"], **common)
        return ClassLabel(num_classes=inp["_num_classes"], **common)
    if kind in ("SchemaDict", "FeatureDict"):
        return SchemaDict({k: deserialize(v) for k, v in inp["items"].items()})
    if kind == "Image":
        return Image(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "Mask":
        return Mask(
            shape=tuple(inp["shape"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "Polygon":
        return Polygon(
            shape=tuple(inp["shape"]),
            max_shape=tuple(inp["max_shape"]),
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "Primitive":
        return Primitive(
            dtype=deserialize(inp["dtype"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "Segmentation":
        class_labels = deserialize(inp["class_labels"])
        common = dict(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
        if class_labels._names is not None:
            return Segmentation(names=class_labels._names, **common)
        return Segmentation(num_classes=class_labels._num_classes, **common)
    if kind == "Sequence":
        return Sequence(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "Tensor":
        return Tensor(
            tuple(inp["shape"]),
            deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "Text":
        return Text(
            tuple(inp["shape"]),
            deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    if kind == "Video":
        return Video(
            shape=tuple(inp["shape"]),
            dtype=deserialize(inp["dtype"]),
            max_shape=tuple(inp["max_shape"]),
            chunks=inp["chunks"],
            compressor=_get_compressor(inp),
        )
    # Unknown serialized type: keep the original implicit-None behavior.
    return None
def test_class_label():
    """ClassLabel construction, duplicate-name validation, and conversions."""
    by_count = ClassLabel(num_classes=5)
    by_names = ClassLabel(names=["apple", "orange", "banana"])
    # Duplicate names are rejected, with or without num_classes supplied.
    with pytest.raises(ValueError):
        ClassLabel(names=["apple", "orange", "banana", "apple"])
    with pytest.raises(ValueError):
        ClassLabel(names=["apple", "orange", "banana", "apple"], num_classes=2)
    ClassLabel()
    from_file = ClassLabel(names_file="./hub/schema/tests/class_label_names.txt")
    assert by_count.names == ["0", "1", "2", "3", "4"]
    assert by_names.names == ["apple", "orange", "banana"]
    assert from_file.names == ["alpha", "beta", "gamma"]
    assert by_count.num_classes == 5
    assert by_names.num_classes == 3
    assert by_count.str2int("3") == 3
    assert by_names.str2int("orange") == 1
    assert by_count.int2str(4) == "4"
    assert by_names.int2str(2) == "banana"
    # "2" is not one of by_names' class names.
    with pytest.raises(KeyError):
        by_names.str2int("2")
    with pytest.raises(ValueError):
        by_count.str2int("8")
    with pytest.raises(ValueError):
        by_count.str2int("abc")
    # Reassigning a names list of the wrong size is rejected.
    with pytest.raises(ValueError):
        by_count.names = ["ab", "cd", "ef", "gh"]
    with pytest.raises(ValueError):
        by_names.names = ["ab", "cd", "ef", "gh"]
def test_class_label_2():
    """Store 1-D and 2-D ClassLabel arrays; compute(True) yields names."""

    def as_ids(label, *names):
        # Translate class-name strings into their integer ids.
        return np.array([label.str2int(name) for name in names])

    cl1 = ClassLabel(names=["apple", "banana", "cat"])
    cl2 = ClassLabel((None,), (10,), names=["apple", "banana", "cat"])
    cl3 = ClassLabel((3,), names=["apple", "banana", "cat"])
    my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3}
    # NOTE(review): shape=(10) is the int 10, not a 1-tuple — presumably
    # Dataset normalizes an int shape; confirm.
    ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10), mode="w")
    # Scalar labels, written one at a time and as a slice.
    ds["cl1", 0] = cl1.str2int("cat")
    ds["cl1", 1] = cl1.str2int("apple")
    ds["cl1", 2] = cl1.str2int("apple")
    ds["cl1", 3:5] = [cl1.str2int("banana"), cl1.str2int("banana")]
    assert ds["cl1", 1].compute(True) == "apple"
    assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"]
    assert ds["cl1", 3:5].compute(True) == ["banana", "banana"]
    # Variable-length label vectors.
    ds["cl2", 0] = as_ids(cl2, "cat", "cat", "apple")
    ds["cl2", 1] = as_ids(cl2, "apple", "banana")
    ds["cl2", 2] = as_ids(cl2, "cat", "apple", "banana", "apple", "banana")
    ds["cl2", 3] = as_ids(cl2, "cat")
    assert ds["cl2", 0].compute(True) == ["cat", "cat", "apple"]
    assert ds["cl2", 1].compute(True) == ["apple", "banana"]
    assert ds["cl2", 2].compute(True) == [
        "cat", "apple", "banana", "apple", "banana"
    ]
    assert ds["cl2", 3].compute(True) == ["cat"]
    # Fixed-length label vectors.
    ds["cl3", 0] = as_ids(cl3, "apple", "apple", "apple")
    ds["cl3", 1] = as_ids(cl3, "banana", "banana", "banana")
    ds["cl3", 2] = as_ids(cl3, "cat", "cat", "cat")
    assert ds["cl3", 0].compute(True) == ["apple", "apple", "apple"]
    assert ds["cl3", 1].compute(True) == ["banana", "banana", "banana"]
    assert ds["cl3", 2].compute(True) == ["cat", "cat", "cat"]
    assert ds["cl3", 0:3].compute(True) == [
        ["apple", "apple", "apple"],
        ["banana", "banana", "banana"],
        ["cat", "cat", "cat"],
    ]
def deserialize(inp):
    """Rebuild a schema object from its serialized dict representation.

    Non-dict values (e.g. primitive dtype strings) are returned unchanged;
    a dict with an unrecognized "type" falls through and yields None.
    """
    if isinstance(inp, dict):
        if inp["type"] == "Audio":
            return Audio(
                shape=tuple(inp["shape"]),
                dtype=deserialize(inp["dtype"]),
                file_format=inp["file_format"],
                sample_rate=inp["sample_rate"],
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "BBox":
            return BBox(
                dtype=deserialize(inp["dtype"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "ClassLabel":
            # Idiom fix: membership test directly on the dict, not .keys().
            if "_num_classes" in inp:
                return ClassLabel(
                    num_classes=inp["_num_classes"],
                    chunks=inp["chunks"],
                    compressor=_get_compressor(inp),
                )
            else:
                return ClassLabel(
                    names=inp["names"],
                    chunks=inp["chunks"],
                    compressor=_get_compressor(inp),
                )
        elif inp["type"] == "SchemaDict" or inp["type"] == "FeatureDict":
            d = {}
            for k, v in inp["items"].items():
                d[k] = deserialize(v)
            return SchemaDict(d)
        elif inp["type"] == "Image":
            return Image(
                shape=tuple(inp["shape"]),
                dtype=deserialize(inp["dtype"]),
                # TODO uncomment back when image encoding will be added
                # encoding_format=inp["encoding_format"],
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "Mask":
            return Mask(
                shape=tuple(inp["shape"]),
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "Polygon":
            return Polygon(
                shape=tuple(inp["shape"]),
                max_shape=tuple(inp["max_shape"]),
                dtype=deserialize(inp["dtype"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "Segmentation":
            class_labels = deserialize(inp["class_labels"])
            # NOTE(review): this hasattr looks always-true if ClassLabel
            # instances always define _num_classes — confirm the intended
            # discriminator (the sibling version checks `_names is not None`).
            if hasattr(class_labels, "_num_classes"):
                return Segmentation(
                    shape=tuple(inp["shape"]),
                    dtype=deserialize(inp["dtype"]),
                    num_classes=class_labels._num_classes,
                    max_shape=tuple(inp["max_shape"]),
                    chunks=inp["chunks"],
                    compressor=_get_compressor(inp),
                )
            else:
                return Segmentation(
                    shape=tuple(inp["shape"]),
                    dtype=deserialize(inp["dtype"]),
                    names=class_labels.names,
                    max_shape=tuple(inp["max_shape"]),
                    chunks=inp["chunks"],
                    compressor=_get_compressor(inp),
                )
        elif inp["type"] == "Sequence":
            return Sequence(
                shape=tuple(inp["shape"]),
                dtype=deserialize(inp["dtype"]),
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "Tensor":
            return Tensor(
                tuple(inp["shape"]),
                deserialize(inp["dtype"]),
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "Text":
            return Text(
                tuple(inp["shape"]),
                deserialize(inp["dtype"]),
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
        elif inp["type"] == "Video":
            return Video(
                shape=tuple(inp["shape"]),
                dtype=deserialize(inp["dtype"]),
                # TODO uncomment back when image encoding will be added
                # encoding_format=inp["encoding_format"],
                max_shape=tuple(inp["max_shape"]),
                chunks=inp["chunks"],
                compressor=_get_compressor(inp),
            )
    else:
        return inp
class Segmentation(Tensor):
    """`HubSchema` for segmentation masks, paired with a ClassLabel schema."""

    def __init__(
        self,
        shape: Tuple[int, ...] = None,
        dtype: str = None,
        num_classes: int = None,
        names: Tuple[str] = None,
        names_file: str = None,
        max_shape: Tuple[int, ...] = None,
        chunks=None,
        compressor="lz4",
    ):
        """Constructs a Segmentation HubSchema.

        Also constructs a ClassLabel HubSchema for the Segmentation classes.

        Parameters
        ----------
        shape: tuple of ints or None
            Shape in format (height, width, 1)
        dtype: str
            dtype of segmentation array: `uint16` or `uint8`
        num_classes: int
            Number of classes. All labels must be < num_classes.
        names: `list<str>`
            string names for the integer classes. The order in which the
            names are provided is kept.
        names_file: str
            Path to a file with names for the integer classes, one per line.
        max_shape : tuple[int]
            Maximum shape of tensor shape if tensor is dynamic
        chunks : tuple[int] | True
            Describes how to split tensor dimensions into chunks (files) to
            store them efficiently. It is anticipated that each file should
            be ~16MB. Sample Count is also in the list of tensor's
            dimensions (first dimension). If default value is chosen,
            automatically detects how to split into chunks.
        compressor : str
            Compressor used for the class-label data; defaults to "lz4".
        """
        # NOTE(review): `compressor` is not forwarded to the Tensor base
        # class — confirm whether Tensor.__init__ should receive it too.
        super().__init__(shape, dtype, max_shape=max_shape, chunks=chunks)
        self.class_labels = ClassLabel(
            num_classes=num_classes,
            names=names,
            names_file=names_file,
            chunks=chunks,
            # Bug fix: honor the caller-supplied compressor instead of the
            # hard-coded "lz4", which silently ignored the parameter.
            compressor=compressor,
        )

    def get_segmentation_classes(self):
        """Get classes of the segmentation mask."""
        # NOTE(review): np.unique(self) treats this schema object as
        # array-like — confirm that is intended for schema instances.
        class_indices = np.unique(self)
        return [self.class_labels.int2str(value) for value in class_indices]

    def get_attr_dict(self):
        """Return class attributes."""
        return self.__dict__

    def __str__(self):
        # Reuse the Tensor repr, swapping the class name and re-appending
        # whichever of names / num_classes is set on the ClassLabel.
        out = super().__str__()
        out = "Segmentation" + out[6:-1]
        out = (
            out + ", names=" + str(self.class_labels._names)
            if self.class_labels._names is not None
            else out
        )
        out = (
            out + ", num_classes=" + str(self.class_labels._num_classes)
            if self.class_labels._num_classes is not None
            else out
        )
        out += ")"
        return out

    def __repr__(self):
        return self.__str__()