def CACD(path: str) -> Dataset:
    """`Cross-Age Celebrity Dataset (CACD) <https://bcsiriuschen.github.io/CARC/>`_ dataset.

    The file structure should be like::

        <path>
            CACD2000/
                14_Aaron_Johnson_0001.jpg
                ...
            celebrity2000.mat

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.catalog.classification = _get_subcatalog()
    segment = dataset.create_segment()
    image_files = glob(os.path.join(root_path, "CACD2000", "*.jpg"))
    labels_map = _get_labels_map(os.path.join(root_path, "celebrity2000.mat"))
    for image in image_files:
        category, attribute = labels_map[os.path.basename(image)]
        image_data = Data(image)
        image_data.label.classification = Classification(category, attribute)
        segment.append(image_data)
    return dataset

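# Hedged usage sketch (added, not from the original source). It assumes the
# TensorBay Dataset API, where a loaded dataset behaves as a sequence of
# segments and each segment as a sequence of Data; "<path>" is a placeholder
# for the local dataset root.
if __name__ == "__main__":
    dataset = CACD("<path>")
    for data in dataset[0]:
        classification = data.label.classification
        print(data.path, classification.category, classification.attributes)
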
def test_create_and_upload_dataset_with_config(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    try:
        gas_client.get_auth_storage_config(name=_LOCAL_CONFIG_NAME)
    except ResourceNotExistError:
        pytest.skip(f"skip this case because there's no {_LOCAL_CONFIG_NAME} config")

    gas_client.create_dataset(dataset_name, config_name=_LOCAL_CONFIG_NAME)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    # When uploading labels, upload the catalog first.
    dataset._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(5):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    assert dataset_client.get_catalog()
    segment1 = Segment("Segment1", client=dataset_client)
    assert len(segment1) == 5
    for i in range(5):
        assert segment1[i].path == f"hello{i}.txt"
        assert segment1[i].label

    gas_client.delete_dataset(dataset_name)

def test_copy_data_between_datasets(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name_1 = get_dataset_name()
    gas_client.create_dataset(dataset_name_1)
    dataset_1 = Dataset(name=dataset_name_1)
    segment_1 = dataset_1.create_segment("Segment1")
    dataset_1._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment_1.append(data)

    dataset_client_1 = gas_client.upload_dataset(dataset_1)
    dataset_client_1.commit("upload data")
    segment_client_1 = dataset_client_1.get_segment("Segment1")

    dataset_name_2 = dataset_name_1 + "_2"
    dataset_client_2 = gas_client.create_dataset(dataset_name_2)
    dataset_client_2.create_draft("draft_2")
    dataset_client_2.create_segment("Segment1")
    segment_client_2 = dataset_client_2.get_segment("Segment1")
    segment_client_2.copy_data("hello0.txt", "hello0.txt", source_client=segment_client_1)

    segment2 = Segment("Segment1", client=dataset_client_2)
    assert segment2[0].path == "hello0.txt"
    assert segment2[0].label

    gas_client.delete_dataset(dataset_name_1)
    gas_client.delete_dataset(dataset_name_2)

def test_move_segment(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    segment_client = dataset_client.move_segment("Segment1", "Segment2")
    assert segment_client.name == "Segment2"

    with pytest.raises(InvalidParamsError):
        dataset_client.move_segment("Segment1", "Segment3", strategy="push")

    segment2 = Segment("Segment2", client=dataset_client)
    assert segment2[0].path == "hello0.txt"
    assert segment2[0].path == segment[0].target_remote_path
    assert segment2[0].label

    gas_client.delete_dataset(dataset_name)

def test_copy_segment_skip(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment1 = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment1.append(data)

    segment2 = dataset.create_segment("Segment2")
    for i in range(10, 20):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment2.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    dataset_client.copy_segment("Segment1", "Segment2", strategy="skip")

    segment_copied = Segment("Segment2", client=dataset_client)
    assert segment_copied[0].path == "hello10.txt"
    assert segment_copied[0].path == segment2[0].target_remote_path
    assert segment_copied[0].label

    gas_client.delete_dataset(dataset_name)

def test_copy_data(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    segment_client = dataset_client.get_segment("Segment1")
    segment_client.copy_data("hello0.txt", "goodbye0.txt")
    segment_client.copy_data("hello1.txt", "hello10.txt")

    with pytest.raises(InvalidParamsError):
        segment_client.copy_data("hello2.txt", "see_you.txt", strategy="push")

    segment2 = Segment("Segment1", client=dataset_client)
    assert segment2[0].path == "goodbye0.txt"
    assert segment2[3].path == "hello10.txt"
    assert segment2[1].label

    gas_client.delete_dataset(dataset_name)

def test_move_data_skip(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text(f"CONTENT_{i}")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    segment_client = dataset_client.get_segment("Segment1")
    segment_client.move_data("hello0.txt", "hello1.txt", strategy="skip")

    # With the "skip" strategy the existing target is left untouched, so
    # "hello1.txt" keeps its original content.
    segment_moved = Segment("Segment1", client=dataset_client)
    assert segment_moved[0].path == "hello1.txt"
    assert segment_moved[0].open().read() == b"CONTENT_1"

    gas_client.delete_dataset(dataset_name)

def test_copy_segment_abort(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment1 = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment1.append(data)

    segment2 = dataset.create_segment("Segment2")
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment2.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    # Copying onto an existing segment without an explicit strategy is rejected.
    with pytest.raises(InternalServerError):
        dataset_client.copy_segment("Segment1", "Segment2")

    gas_client.delete_dataset(dataset_name)

def test_upload_dataset_to_given_draft(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_random_dataset_name()
    dataset_client_1 = gas_client.create_dataset(dataset_name)
    draft_number = dataset_client_1.create_draft("test")

    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        segment.append(Data(local_path=str(local_path)))

    dataset_client_2 = gas_client.upload_dataset(dataset, draft_number=draft_number)
    segment1 = Segment("Segment1", client=dataset_client_2)
    assert len(segment1) == 10
    assert segment1[0].path == "hello0.txt"
    assert not segment1[0].label

    with pytest.raises(GASResponseError):
        gas_client.upload_dataset(dataset, draft_number=draft_number + 1)

    gas_client.delete_dataset(dataset_name)

def FSDD(path: str) -> Dataset:
    """`Free Spoken Digit <https://github.com/Jakobovski/free-spoken-digit-dataset>`_ dataset.

    The file structure should be like::

        <path>
            recordings/
                0_george_0.wav
                0_george_1.wav
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    label_map = {}
    for key, value in _METADATA.items():
        attributes = {"name": key}
        attributes.update(value)
        label_map[key] = attributes

    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    audio_paths = glob(os.path.join(root_path, "recordings", "*.wav"))
    for audio_path in audio_paths:
        # The filename encodes "<digit>_<speaker>_<index>.wav".
        category, name = os.path.basename(audio_path).split("_")[:2]
        data = Data(audio_path)
        data.label.classification = Classification(category, label_map[name])
        segment.append(data)
    return dataset

def test_upload_dataset_after_commit(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)

    dataset = Dataset(name=dataset_name)
    dataset._catalog = Catalog.loads(CATALOG)
    dataset.notes.is_continuous = True
    segment = dataset.create_segment("Segment1")
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    dataset_client.commit("test")

    dataset_remote = Dataset(name=dataset_name, gas=gas_client)
    assert dataset_remote.notes.is_continuous == dataset.notes.is_continuous
    assert dataset_remote.catalog == dataset.catalog

    segment_remote = dataset_remote[0]
    assert len(segment_remote) == len(segment)
    for remote_data, data in zip(segment_remote, segment):
        assert remote_data.path == data.target_remote_path
        assert remote_data.label == data.label

    gas_client.delete_dataset(dataset_name)

def CCPDGreen(path: str) -> Dataset:
    """`CCPDGreen <https://github.com/detectRecog/CCPD>`_ dataset.

    The file structure should be like::

        <path>
            ccpd_green/
                train/
                test/
                val/

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.join(os.path.abspath(os.path.expanduser(path)), "ccpd_green")
    dataset = Dataset(DATASET_NAME_CCPDGREEN)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    for segment_name in _CCPDGREEN_SEGMENTS:
        segment = dataset.create_segment(segment_name)
        for image_path in glob(os.path.join(root_path, segment_name, "*.jpg")):
            data = Data(image_path)
            data.label.polygon = _get_polygons(image_path)
            segment.append(data)
    return dataset

def PASCALContext(mask_path: str, image_path: str) -> Dataset:
    """`PASCALContext <https://cs.stanford.edu/~roozbeh/pascal-context/>`_ dataset.

    The file structure should be like::

        <mask_path>
            <image_name>.png
            ...
        <image_path>
            <image_name>.jpg
            ...

    Arguments:
        mask_path: The root directory of the dataset mask.
        image_path: The root directory of the dataset image.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_mask_path = os.path.abspath(os.path.expanduser(mask_path))
    root_image_path = os.path.abspath(os.path.expanduser(image_path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment("trainval")
    for mask_filename in glob(os.path.join(root_mask_path, "*.png")):
        stem = os.path.splitext(os.path.basename(mask_filename))[0]
        data = Data(os.path.join(root_image_path, f"{stem}.jpg"))
        data.label.semantic_mask = SemanticMask(mask_filename)
        segment.append(data)
    return dataset

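# Hedged usage sketch (added, not from the original source). Unlike the
# single-path loaders, PASCALContext takes the mask root and the image root
# separately; both arguments below are placeholders, and reading the mask's
# local path assumes the TensorBay SemanticMask API.
if __name__ == "__main__":
    dataset = PASCALContext("<mask_path>", "<image_path>")
    for data in dataset[0]:
        print(data.path, data.label.semantic_mask.path)
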
def test_upload_dataset_only_with_file(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)

    dataset = Dataset(name=dataset_name)
    dataset.notes.is_continuous = True
    segment = dataset.create_segment("Segment1")
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        segment.append(Data(local_path=str(local_path)))

    dataset_client = gas_client.upload_dataset(dataset)
    assert dataset_client.status.branch_name == DEFAULT_BRANCH
    assert dataset_client.status.draft_number
    assert not dataset_client.status.commit_id
    assert dataset_client.get_notes().is_continuous is True
    assert not dataset_client.get_catalog()

    segment1 = Segment("Segment1", client=dataset_client)
    assert len(segment1) == 10
    assert segment1[0].path == "hello0.txt"
    assert not segment1[0].label

    gas_client.delete_dataset(dataset_name)

def CarConnection(path: str) -> Dataset:
    """`Car Connection Picture <https://github.com/nicolas-gervais\
    /predicting-car-price-from-scraped-data/tree/master/picture-scraper>`_ dataset.

    The file structure should be like::

        <path>
            <imagename>.jpg
            ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    image_paths = glob(os.path.join(root_path, "*.jpg"))
    keys = dataset.catalog.classification.attributes.keys()
    for image_path in image_paths:
        data = Data(image_path)
        basename = os.path.basename(image_path)
        label = _extract_label_from_basename(keys, basename)
        data.label.classification = label
        segment.append(data)
    return dataset

def test_upload_dataset_with_label(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    # When uploading labels, upload the catalog first.
    dataset._catalog = Catalog.loads(CATALOG)

    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    assert dataset_client.get_catalog()
    segment1 = Segment("Segment1", client=dataset_client)
    assert len(segment1) == 10
    assert segment1[0].path == "hello0.txt"
    assert segment1[0].label

    gas_client.delete_dataset(dataset_name)

def ImageEmotionArtphoto(path: str) -> Dataset:
    """`Image Emotion-art Photo <https://www.imageemotion.org/>`_ dataset.

    The file structure should be like::

        <path>
            <filename>.jpg
            ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME_ARTPHOTO)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog_artphoto.json"))
    segment = dataset.create_segment()
    image_paths = glob(os.path.join(root_path, "*.jpg"))
    for image_path in image_paths:
        image_category = os.path.basename(image_path).split("_", 1)[0]
        data = Data(image_path)
        data.label.classification = Classification(category=image_category)
        segment.append(data)
    return dataset

def DownsampledImagenet(path: str) -> Dataset:
    """`Downsampled Imagenet <https://www.tensorflow.org/datasets\
    /catalog/downsampled_imagenet>`_ dataset.

    The file structure should be like::

        <path>
            valid_32x32/
                <imagename>.png
                ...
            valid_64x64/
                <imagename>.png
                ...
            train_32x32/
                <imagename>.png
                ...
            train_64x64/
                <imagename>.png
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    for segment_name in SEGMENT_NAMES:
        dataset.add_segment(_get_segment(segment_name, root_path))
    return dataset

def test_import_cloud_files(self, accesskey, url, config_name):
    gas_client = GAS(access_key=accesskey, url=url)
    try:
        cloud_client = gas_client.get_cloud_client(config_name)
    except ResourceNotExistError:
        pytest.skip(f"skip this case because there's no {config_name} config")

    auth_data = cloud_client.list_auth_data("tests")
    dataset_name = get_dataset_name()
    dataset_client = gas_client.create_dataset(dataset_name, config_name=config_name)

    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    for data in auth_data:
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset, jobs=5)
    dataset_client.commit("import data")

    segment1 = Segment("Segment1", client=dataset_client)
    assert len(segment1) == len(segment)
    assert segment1[0].path == segment[0].path.split("/")[-1]
    assert not segment1[0].label
    assert len(auth_data) == len(segment)

    gas_client.delete_dataset(dataset_name)

def AnimalsWithAttributes2(path: str) -> Dataset:
    """`Animals with attributes 2 <https://cvml.ist.ac.at/AwA2/>`_ dataset.

    The file structure should be like::

        <path>
            classes.txt
            predicates.txt
            predicate-matrix-binary.txt
            JPEGImages/
                <classname>/
                    <imagename>.jpg
                    ...
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()

    # Each line is "<index>\t<name>"; keep only the name.
    with open(os.path.join(root_path, "classes.txt"), encoding="utf-8") as fp:
        class_names = [line[:-1].split("\t", 1)[-1] for line in fp]
    with open(os.path.join(root_path, "predicates.txt"), encoding="utf-8") as fp:
        attribute_keys = [line[:-1].split("\t", 1)[-1] for line in fp]
    # One space-separated row of 0/1 flags per class.
    with open(os.path.join(root_path, "predicate-matrix-binary.txt"), encoding="utf-8") as fp:
        attribute_values = [line[:-1].split(" ") for line in fp]

    attribute_mapping = {}
    for class_name, values in zip(class_names, attribute_values):
        attribute_mapping[class_name] = Classification(
            category=class_name,
            attributes=dict(zip(attribute_keys, (bool(int(value)) for value in values))),
        )
    for class_name in sorted(os.listdir(os.path.join(root_path, "JPEGImages"))):
        image_paths = glob(os.path.join(root_path, "JPEGImages", class_name, "*.jpg"))
        label = attribute_mapping[class_name]
        for image_path in image_paths:
            data = Data(image_path)
            data.label.classification = label
            segment.append(data)
    return dataset

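# For reference, the line formats the parsing above expects, inferred from the
# split calls (sample values are illustrative only, not taken from the data):
#
#     classes.txt:                  "1\tantelope"
#     predicates.txt:               "1\tblack"
#     predicate-matrix-binary.txt:  "0 0 1 ..." (one row of 0/1 flags per class)
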
def COVID_CT(path: str) -> Dataset:
    """`COVID-CT <https://github.com/UCSD-AI4H/COVID-CT>`_ dataset.

    The file structure should be like::

        <path>
            Data-split/
                COVID/
                    testCT_COVID.txt
                    trainCT_COVID.txt
                    valCT_COVID.txt
                NonCOVID/
                    testCT_NonCOVID.txt
                    trainCT_NonCOVID.txt
                    valCT_NonCOVID.txt
            Images-processed/
                CT_COVID/
                    ...
                    2020.01.24.919183-p27-132.png
                    2020.01.24.919183-p27-133.png
                    ...
                    PIIS0140673620303603%8.png
                    ...
                CT_NonCOVID/
                    0.jpg
                    1%0.jpg
                    ...
                    91%1.jpg
                    102.png
                    ...
                    2341.png

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    data_split_path = os.path.join(root_path, "Data-split")
    images_processed_path = os.path.join(root_path, "Images-processed")
    for segment_name, (split_filename, image_dir, category) in _SEGMENT_TO_PATH.items():
        segment = dataset.create_segment(segment_name)
        image_dir = os.path.join(images_processed_path, image_dir)
        with open(
            os.path.join(data_split_path, category, split_filename), "r", encoding="utf-8"
        ) as fp:
            for line in fp:
                image_path = os.path.join(image_dir, line.strip("\n"))
                data = Data(image_path)
                data.label.classification = Classification(category)
                segment.append(data)
    return dataset

def DeepRoute(path: str) -> Dataset:
    """`DeepRoute <https://gas.graviti.cn/dataset/graviti-open-dataset\
    /DeepRoute>`_ dataset.

    The file structure should be like::

        <path>
            pointcloud/
                00001.bin
                00002.bin
                ...
                10000.bin
            groundtruth/
                00001.txt
                00002.txt
                ...
                10000.txt

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()

    point_cloud_paths = glob(os.path.join(root_path, "pointcloud", "*.bin"))
    for point_cloud_path in point_cloud_paths:
        point_cloud_id = os.path.splitext(os.path.basename(point_cloud_path))[0]
        label_path = os.path.join(root_path, "groundtruth", f"{point_cloud_id}.txt")
        data = Data(point_cloud_path)
        data.label.box3d = []
        with open(label_path, encoding="utf-8") as fp:
            annotations = json.load(fp)["objects"]

        for annotation in annotations:
            bounding_box = annotation["bounding_box"]
            position = annotation["position"]
            label = LabeledBox3D(
                size=(bounding_box["length"], bounding_box["width"], bounding_box["height"]),
                translation=(position["x"], position["y"], position["z"]),
                rotation=from_rotation_vector((0, 0, annotation["heading"])),
                category=annotation["type"],
            )
            data.label.box3d.append(label)
        segment.append(data)
    return dataset

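# For reference, each groundtruth .txt file holds JSON whose "objects" entries
# are consumed by the loop above. A minimal record, inferred from the parsing
# code (all field values are illustrative only):
#
#     {
#         "objects": [
#             {
#                 "type": "CAR",
#                 "heading": 1.57,
#                 "position": {"x": 1.0, "y": 2.0, "z": 0.5},
#                 "bounding_box": {"length": 4.5, "width": 1.8, "height": 1.6}
#             }
#         ]
#     }
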
def CCPD(path: str) -> Dataset:
    """`CCPD <https://github.com/detectRecog/CCPD>`_ dataset.

    The file structure should be like::

        <path>
            ccpd_np/
                1005.jpg
                1019.jpg
                ...
            ccpd_base/
                00205459770115-90_85-352&516_448&547- \
444&547_368&549_364&517_440&515-0_0_22_10_26_29_24-128-7.jpg
                00221264367816-91_91-283&519_381&553- \
375&551_280&552_285&514_380&513-0_0_7_26_17_33_29-95-9.jpg
                ...
            ccpd_blur/
            ccpd_challenge/
            ccpd_db/
            ccpd_fn/
            ccpd_rotate/
            ccpd_tilt/
            ccpd_weather/
            LICENSE
            README.md
            splits/
                ccpd_blur.txt
                ccpd_challenge.txt
                ccpd_db.txt
                ccpd_fn.txt
                ccpd_rotate.txt
                ccpd_tilt.txt
                test.txt
                train.txt
                val.txt

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME_CCPD)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    for segment_head, segment_tails in _CCPD_SEGMENTS.items():
        for segment_tail in segment_tails:
            segment_name = f"{segment_head}-{segment_tail}"
            segment = dataset.create_segment(segment_name)
            # The "other-np" segment carries no polygon annotations.
            get_polygons = _get_polygons if segment_name != "other-np" else lambda _: []
            for image_path in _get_ccpd_image_path(root_path, segment_head, segment_tail):
                data = Data(image_path)
                data.label.polygon = get_polygons(image_path)
                segment.append(data)
    return dataset

def VOC2012Segmentation(path: str) -> Dataset:
    """`VOC2012Segmentation <http://host.robots.ox.ac.uk/pascal/VOC/voc2012/>`_ dataset.

    The file structure should be like::

        <path>/
            JPEGImages/
                <image_name>.jpg
                ...
            SegmentationClass/
                <mask_name>.png
                ...
            SegmentationObject/
                <mask_name>.png
                ...
            ImageSets/
                Segmentation/
                    train.txt
                    val.txt
                    ...
                ...
            ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    image_path = os.path.join(root_path, "JPEGImages")
    semantic_mask_path = os.path.join(root_path, "SegmentationClass")
    instance_mask_path = os.path.join(root_path, "SegmentationObject")
    image_set_path = os.path.join(root_path, "ImageSets", "Segmentation")

    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    for segment_name in _SEGMENT_NAMES:
        segment = dataset.create_segment(segment_name)
        with open(os.path.join(image_set_path, f"{segment_name}.txt"), encoding="utf-8") as fp:
            for stem in fp:
                stem = stem.strip()
                data = Data(os.path.join(image_path, f"{stem}.jpg"))
                label = data.label
                mask_filename = f"{stem}.png"
                label.semantic_mask = SemanticMask(os.path.join(semantic_mask_path, mask_filename))
                label.instance_mask = InstanceMask(os.path.join(instance_mask_path, mask_filename))
                segment.append(data)
    return dataset

def test_cache_dataset(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    # When uploading labels, upload the catalog first.
    dataset._catalog = Catalog.loads(_CATALOG)

    path = tmp_path / "sub"
    semantic_path = tmp_path / "semantic_mask"
    instance_path = tmp_path / "instance_mask"
    path.mkdir()
    semantic_path.mkdir()
    instance_path.mkdir()
    for i in range(_SEGMENT_LENGTH):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(_LABEL)
        semantic_mask = semantic_path / f"semantic_mask{i}.png"
        semantic_mask.write_text("SEMANTIC_MASK")
        data.label.semantic_mask = SemanticMask(str(semantic_mask))
        instance_mask = instance_path / f"instance_mask{i}.png"
        instance_mask.write_text("INSTANCE_MASK")
        data.label.instance_mask = InstanceMask(str(instance_mask))
        segment.append(data)

    dataset_client = gas_client.upload_dataset(dataset)
    dataset_client.commit("commit-1")

    cache_path = tmp_path / "cache_test"
    dataset_client.enable_cache(str(cache_path))
    segment1 = Segment("Segment1", client=dataset_client)
    for data in segment1:
        data.open()
        data.label.semantic_mask.open()
        data.label.instance_mask.open()

    segment_cache_path = (
        cache_path / dataset_client.dataset_id / dataset_client.status.commit_id / "Segment1"
    )
    semantic_mask_cache_path = segment_cache_path / "semantic_mask"
    instance_mask_cache_path = segment_cache_path / "instance_mask"
    for cache_dir, extension in (
        (segment_cache_path, "txt"),
        (semantic_mask_cache_path, "png"),
        (instance_mask_cache_path, "png"),
    ):
        assert set(cache_dir.glob(f"*.{extension}")) == set(
            cache_dir / f"hello{i}.{extension}" for i in range(_SEGMENT_LENGTH)
        )

    gas_client.delete_dataset(dataset_name)

def AnimalPose7(path: str) -> Dataset:
    """`7 Categories Animal-Pose <https://sites.google.com/view/animal-pose/>`_ dataset.

    The file structure should be like::

        <path>
            bndbox_image/
                antelope/
                    Img-77.jpg
                    ...
                ...
            bndbox_anno/
                antelope.json
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME_7)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog_7.json"))
    segment = dataset.create_segment()
    for animal in dataset.catalog.box2d.categories.keys():
        with open(
            os.path.join(root_path, "bndbox_anno", f"{animal}.json"), encoding="utf-8"
        ) as fp:
            annotations = json.load(fp)
        for image_name, box2ds in annotations.items():
            image_path = os.path.join(root_path, "bndbox_image", animal, image_name)
            data = Data(image_path, target_remote_path=f"{animal}/{image_name}")
            data.label.box2d = []
            for box2d in box2ds:
                coordinates = box2d["bndbox"]
                data.label.box2d.append(
                    LabeledBox2D(
                        float(coordinates["xmin"]),
                        float(coordinates["ymin"]),
                        float(coordinates["xmax"]),
                        float(coordinates["ymax"]),
                        category=animal,
                    )
                )
            segment.append(data)
    return dataset

def _BDD100K_loader(path: str, dataset_type: str) -> Dataset:
    root_path = os.path.join(
        os.path.abspath(os.path.expanduser(path)), f"bdd100k_images_{dataset_type}"
    )
    dataset = Dataset(DATASET_NAMES[dataset_type])
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), f"catalog_{dataset_type}.json"))
    load_segment = _load_segment_10k if dataset_type == "10k" else _load_segment_100k
    labels_dir = os.path.join(root_path, "labels")
    load_segment(dataset, root_path, labels_dir)
    return dataset

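# A hedged sketch of how public entry points might wrap the dispatcher above
# (added for illustration; the wrapper names are assumptions, not confirmed
# repository API). The dispatcher itself only distinguishes the "100k" and
# "10k" dataset types.
def BDD100K(path: str) -> Dataset:
    return _BDD100K_loader(path, "100k")


def BDD100K_10K(path: str) -> Dataset:
    return _BDD100K_loader(path, "10k")
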
def RarePlanesReal(path: str) -> Dataset:
    """`RarePlanesReal <https://www.cosmiqworks.org/RarePlanes/>`_ dataset.

    The folder structure should be like::

        <path>
            metadata_annotations/
                RarePlanes_Public_Metadata.csv
                RarePlanes_Test_Coco_Annotations_tiled.json
                RarePlanes_Train_Coco_Annotations_tiled.json
            test/
                PS-RGB_tiled/
                    105_104001003108D900_tile_47.png
                    ...
            train/
                PS-RGB_tiled/
                    100_1040010029990A00_tile_319.png
                    ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    catalog = dataset.catalog
    annotations_dir = os.path.join(root_path, "metadata_annotations")
    classification_attributes = _get_classification_attributes(
        os.path.join(annotations_dir, "RarePlanes_Public_Metadata.csv"),
        catalog.classification.attributes.keys(),
    )
    for segment_name in _SEGMENT_NAMES:
        segment = dataset.create_segment(segment_name)
        image_name_to_polygons = _get_polygon_labels(
            annotations_dir, segment_name, catalog.polygon.attributes.keys()
        )
        for image_path in glob(os.path.join(root_path, segment_name, "PS-RGB_tiled", "*.png")):
            data = Data(image_path)
            label = data.label
            filename = os.path.basename(image_path)
            # "105_104001003108D900_tile_47.png" -> image id "105_104001003108D900".
            image_id = filename.rsplit("_", 2)[0]
            label.polygon = image_name_to_polygons[filename]
            label.classification = Classification(
                attributes=classification_attributes[image_id]
            )
            segment.append(data)
    return dataset

def KenyanFoodType(path: str) -> Dataset:
    """`Kenyan Food Type <https://github.com/monajalal/Kenyan-Food>`_ dataset.

    The file structure should be like::

        <path>
            test.csv
            test/
                bhaji/
                    1611654056376059197.jpg
                    ...
                chapati/
                    1451497832469337023.jpg
                    ...
                ...
            train/
                bhaji/
                    190393222473009410.jpg
                    ...
                chapati/
                    1310641031297661755.jpg
                    ...
            val/
                bhaji/
                    1615408264598518873.jpg
                    ...
                chapati/
                    1553618479852020228.jpg
                    ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME_FOOD_TYPE)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog_food_type.json"))
    for segment_name in SEGMENTS_FOOD_TYPE:
        segment = dataset.create_segment(segment_name)
        segment_path = os.path.join(root_path, segment_name)
        for category in sorted(os.listdir(segment_path)):
            image_paths = glob(os.path.join(segment_path, category, "*.jpg"))
            label = Classification(category)
            for image_path in image_paths:
                data = Data(image_path)
                data.label.classification = label
                segment.append(data)
    return dataset

def LeedsSportsPose(path: str) -> Dataset:
    """`Leeds Sports Pose <http://sam.johnson.io/research/lsp.html>`_ dataset.

    The folder structure should be like::

        <path>
            joints.mat
            images/
                im0001.jpg
                im0002.jpg
                ...

    Arguments:
        path: The root directory of the dataset.

    Raises:
        ModuleImportError: When the module "scipy" cannot be found.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    try:
        from scipy.io import loadmat  # pylint: disable=import-outside-toplevel
    except ModuleNotFoundError as error:
        raise ModuleImportError(module_name=error.name) from error

    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()

    mat = loadmat(os.path.join(root_path, "joints.mat"))
    joints = mat["joints"].T
    image_paths = glob(os.path.join(root_path, "images", "*.jpg"))
    for image_path in image_paths:
        data = Data(image_path)
        data.label.keypoints2d = []
        index = int(os.path.basename(image_path)[2:6]) - 1  # get image index from "im0001.jpg"
        keypoints = LabeledKeypoints2D()
        for keypoint in joints[index]:
            # The third value from the .mat is inverted to match the keypoint
            # visibility convention.
            keypoints.append(Keypoint2D(keypoint[0], keypoint[1], int(not keypoint[2])))
        data.label.keypoints2d.append(keypoints)
        segment.append(data)
    return dataset