def test_upload_frame_without_sensor(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    dataset_client = gas_client.create_dataset(dataset_name, is_fusion=True)
    dataset_client.create_draft("draft-1")
    segment_client = dataset_client.get_or_create_segment("segment1")
    path = tmp_path / "sub"
    path.mkdir()

    frame = Frame()
    local_path = path / "hello0.txt"
    local_path.write_text("CONTENT")
    data = Data(local_path=str(local_path))
    frame[LIDAR_DATA["name"]] = data

    # Uploading a frame is not allowed before its sensor has been uploaded.
    with pytest.raises(ResponseError):
        segment_client.upload_frame(frame, timestamp=0)

    gas_client.delete_dataset(dataset_name)
def test_upload_data_without_label(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    dataset_client = gas_client.create_dataset(dataset_name)
    dataset_client.create_draft("draft-1")
    segment_client = dataset_client.get_or_create_segment("segment1")
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(5):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        segment_client.upload_data(Data(local_path=str(local_path)))

    data = segment_client.list_data()
    assert data[0].path == "hello0.txt"
    assert data[0].open().read() == b"CONTENT"
    assert not data[0].label
    # TODO: match the input and output label

    gas_client.delete_dataset(dataset_name)
def _get_data_10k(
    image_path: str,
    original_mask_paths: Dict[str, str],
    label_content: Dict[str, Any],
    single_channel_mask_paths: Dict[str, str],
) -> Data:
    data = Data(image_path)
    polygon: List[LabeledPolygon] = []
    for label_info in label_content["labels"]:
        if "poly2d" in label_info:
            _add_poly2d_label_10k(label_info, polygon)

    label = data.label
    label.polygon = polygon
    stem = os.path.splitext(os.path.basename(image_path))[0]
    label.semantic_mask = SemanticMask(os.path.join(original_mask_paths["sem"], f"{stem}.png"))
    label.instance_mask = _get_instance_mask(
        stem, original_mask_paths["ins"], single_channel_mask_paths["ins"]
    )
    label.panoptic_mask = _get_panoptic_mask(
        stem, original_mask_paths["pan"], single_channel_mask_paths["pan"]
    )
    return data
def _get_data_part2(root_path: str, animals: Iterable[str]) -> Iterator[Data]:
    try:
        import xmltodict  # pylint: disable=import-outside-toplevel
    except ModuleNotFoundError as error:
        raise ModuleImportError(module_name=error.name) from error

    for animal in animals:
        for image_path in glob(
            os.path.join(root_path, "animalpose_image_part2", animal, "*.jpeg")
        ):
            data = Data(image_path, target_remote_path=f"{animal}/{os.path.basename(image_path)}")
            annotation_path = os.path.join(
                root_path,
                "animalpose_anno2",
                animal,
                f"{os.path.splitext(os.path.basename(image_path))[0]}.xml",
            )
            with open(annotation_path, encoding="utf-8") as fp:
                labels = xmltodict.parse(fp.read())
            box2d = labels["annotation"]["visible_bounds"]
            data.label.box2d = [
                LabeledBox2D.from_xywh(
                    x=float(box2d["@xmin"]),
                    y=float(box2d["@xmax"]),  # "xmax" actually stores ymin in this annotation
                    width=float(box2d["@width"]),
                    height=float(box2d["@height"]),
                    category=animal,
                )
            ]
            keypoints2d = LabeledKeypoints2D(category=animal)
            for keypoint in labels["annotation"]["keypoints"]["keypoint"]:
                keypoints2d.append(
                    Keypoint2D(
                        float(keypoint["@x"]), float(keypoint["@y"]), int(keypoint["@visible"])
                    )
                )
            data.label.keypoints2d = [keypoints2d]
            yield data
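# Illustrative sketch (an assumption, not from the original file): the shape of
# one xmltodict-parsed annotation as consumed by _get_data_part2 above. The
# values are made up; the keys mirror the dictionary accesses in the loop, and
# "@xmax" is the field that actually stores the box's ymin.
_EXAMPLE_PART2_ANNOTATION = {
    "annotation": {
        "visible_bounds": {"@xmin": "12", "@xmax": "34", "@width": "56", "@height": "78"},
        "keypoints": {
            "keypoint": [
                {"@x": "20.5", "@y": "40.5", "@visible": "1"},
                {"@x": "25.0", "@y": "38.0", "@visible": "0"},
            ]
        },
    }
}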
def KylbergTexture(path: str) -> Dataset:
    """`Kylberg Texture <http://www.cb.uu.se/~gustaf/texture/>`_ dataset.

    The file structure should be like::

        <path>
            originalPNG/
                <imagename>.png
                ...
            withoutRotateAll/
                <imagename>.png
                ...
            RotateAll/
                <imagename>.png
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))

    for segment_name, label_getter in _LABEL_GETTERS.items():
        image_paths = glob(os.path.join(root_path, segment_name, "*.png"))
        segment = dataset.create_segment(segment_name)
        for image_path in image_paths:
            data = Data(image_path)
            stem = os.path.splitext(os.path.basename(image_path))[0]
            data.label.classification = label_getter(stem)
            segment.append(data)
    return dataset
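# A minimal usage sketch for KylbergTexture (illustrative, not part of the
# original file; the local path is a placeholder):
def _example_load_kylberg_texture() -> None:
    dataset = KylbergTexture("~/data/KylbergTexture")
    for segment in dataset:
        # One segment per image folder; every datum carries a classification label.
        print(segment.name, len(segment))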
def test_cache_fusion_dataset(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    dataset_client = gas_client.create_dataset(dataset_name, is_fusion=True)
    dataset_client.create_draft("draft-1")
    segment = FusionSegment("Segment1")
    segment.sensors = Sensors.loads(_SENSORS_DATA)
    paths = {"Lidar1": tmp_path / "lidar", "Camera1": tmp_path / "camera"}
    for path in paths.values():
        path.mkdir()

    for i in range(_SEGMENT_LENGTH):
        frame = Frame()
        for sensor_data in _SENSORS_DATA:
            sensor_name = sensor_data["name"]
            data_path = paths[sensor_name] / f"{sensor_name}{i}.txt"
            data_path.write_text("CONTENT")
            frame[sensor_name] = Data(local_path=str(data_path))
        segment.append(frame)
    dataset_client.upload_segment(segment)
    dataset_client.commit("commit-1")

    cache_path = tmp_path / "cache_test"
    dataset_client.enable_cache(str(cache_path))
    segment1 = FusionSegment(name="Segment1", client=dataset_client)
    for frame in segment1:
        for data in frame.values():
            data.open()

    segment_cache_path = (
        cache_path / dataset_client.dataset_id / dataset_client.status.commit_id / "Segment1"
    )
    correct_files = {
        segment_cache_path / f'{sensor_data["name"]}{i}.txt'
        for i in range(_SEGMENT_LENGTH)
        for sensor_data in _SENSORS_DATA
    }
    assert set(segment_cache_path.glob("*.txt")) == correct_files

    gas_client.delete_dataset(dataset_name)
def CoinImage(path: str) -> Dataset:
    """`Coin Image <https://cvl.tuwien.ac.at/research/cvl-databases/coin-image-dataset/>`_ dataset.

    The file structure should be like::

        <path>
            classes.csv
            <imagename>.png
            ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    csv_path = os.path.join(root_path, "classes.csv")
    with open(csv_path, "r", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter=";")
        mapping: Dict[str, str] = dict(row for row in reader)  # type: ignore[arg-type, misc]
    image_paths = glob(os.path.join(root_path, "*.png"))
    for image_path in image_paths:
        data = Data(image_path)
        filename = os.path.basename(image_path)
        # Skip the 5-character filename prefix; the class id is the first
        # "_"-separated field that follows.
        class_id = filename[5:].split("_", 1)[0]
        data.label.classification = Classification(category=mapping[class_id])
        segment.append(data)
    return dataset
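# Illustrative sketch (an assumption inferred from the parsing above, not
# verified against the original file): classes.csv is read as ";"-delimited
# two-column rows mapping class id to category name, e.g. a row such as
#
#     00123;some_coin_class
#
# makes mapping["00123"] == "some_coin_class".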
def BioIDFace(path: str) -> Dataset:
    """`BioID Face <https://www.bioid.com/facedb/>`_ dataset.

    The folder structure should be like::

        <path>
            BioID-FaceDatabase-V1.2/
                BioID_0000.eye
                BioID_0000.pgm
                ...
            points_20/
                bioid_0000.pts
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    # glob() order is not guaranteed; sort both lists so that each image is
    # zipped with the keypoints file of the same index.
    image_paths = sorted(glob(os.path.join(root_path, "BioID-FaceDatabase-V1.2", "*.pgm")))
    face_keypoints_paths = sorted(glob(os.path.join(root_path, "points_20", "*.pts")))
    for image_path, face_keypoints_path in zip(image_paths, face_keypoints_paths):
        data = Data(image_path)
        data.label.keypoints2d = _get_label(
            f"{os.path.splitext(image_path)[0]}.eye", face_keypoints_path
        )
        segment.append(data)
    return dataset
def ImageEmotionAbstract(path: str) -> Dataset:
    """`Image Emotion-abstract <https://www.imageemotion.org/>`_ dataset.

    The file structure should be like::

        <path>
            ABSTRACT_groundTruth.csv
            abstract_xxxx.jpg
            ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME_ABSTRACT)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog_abstract.json"))
    segment = dataset.create_segment()
    csv_path = os.path.join(root_path, "ABSTRACT_groundTruth.csv")
    with open(csv_path, encoding="utf-8") as fp:
        reader = csv.DictReader(fp)
        # The header fields are wrapped in single quotes; strip them.
        reader.fieldnames = [
            field.strip("'") for field in reader.fieldnames  # type: ignore[union-attr]
        ]
        for row in reader:
            image_path = os.path.join(root_path, row.pop("").strip("'"))
            data = Data(image_path)
            values = {key: int(value) for key, value in row.items()}
            data.label.classification = Classification(attributes=values)
            segment.append(data)
    return dataset
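# Illustrative sketch (an assumption inferred from the parsing above): the
# ground-truth CSV has a quoted, unnamed first column holding the image name,
# followed by quoted emotion columns with integer counts, e.g.:
#
#     '','Amusement','Anger',...
#     'abstract_0001.jpg',12,0,...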
def _get_mots_data(
    image_path: str,
    original_mask_subdir: str,
    semantic_subdir: str,
    instance_subdir: str,
    stem: str,
    *,
    label_content: Dict[str, Any],
) -> Data:
    data = Data(image_path)
    labeled_multipolygons = []
    for label_info in label_content.get("labels", ()):
        if "poly2d" not in label_info:
            continue
        labeled_multipolygon = LabeledMultiPolygon(
            polygons=(poly2d_info["vertices"] for poly2d_info in label_info["poly2d"]),
            category=label_info["category"],
            attributes=label_info["attributes"],
            instance=str(label_info["id"]),
        )
        labeled_multipolygons.append(labeled_multipolygon)

    semantic_path = os.path.join(semantic_subdir, f"{stem}.png")
    instance_path = os.path.join(instance_subdir, f"{stem}.png")
    mask_info = _save_and_get_mask_info(
        os.path.join(original_mask_subdir, f"{stem}.png"),
        semantic_path,
        instance_path,
        os.path.join(instance_subdir, f"{stem}.json"),
    )
    ins_mask = InstanceMask(instance_path)
    ins_mask.all_attributes = mask_info["all_attributes"]

    label = data.label
    label.multi_polygon = labeled_multipolygons
    label.semantic_mask = SemanticMask(semantic_path)
    label.instance_mask = ins_mask
    return data
def DogsVsCats(path: str) -> Dataset:
    """`Dogs vs Cats <https://www.kaggle.com/c/dogs-vs-cats>`_ dataset.

    The file structure should be like::

        <path>
            train/
                cat.0.jpg
                ...
                dog.0.jpg
                ...
            test/
                1000.jpg
                1001.jpg
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    for segment_name, is_labeled in _SEGMENTS.items():
        segment = dataset.create_segment(segment_name)
        image_paths = glob(os.path.join(root_path, segment_name, "*.jpg"))
        for image_path in image_paths:
            data = Data(image_path)
            if is_labeled:
                # The category is the "cat"/"dog" prefix of the file name.
                data.label.classification = Classification(os.path.basename(image_path)[:3])
            segment.append(data)
    return dataset
def _load_positive_segment(segment_name: str, segment_path: str) -> Segment:
    if segment_name.startswith("vid"):
        # Zero-pad the segment name to change "vid0" to "vid00".
        segment_name = f"{segment_name[:3]}{int(segment_name[3:]):02}"
    segment = Segment(segment_name)
    annotation_file = glob(
        os.path.join(segment_path, "frameAnnotations-*", "frameAnnotations.csv")
    )[0]
    image_folder = os.path.dirname(annotation_file)
    pre_filename = ""
    with open(annotation_file, encoding="utf-8") as fp:
        for annotation in csv.DictReader(fp, delimiter=";"):
            filename = annotation["Filename"]
            if filename != pre_filename:
                data = Data(os.path.join(image_folder, filename))
                data.label.box2d = []
                segment.append(data)
                pre_filename = filename

            # The "Occluded" and "On another road" values share one ";"-delimited
            # column, so split them apart here.
            occluded, on_another_road = annotation["Occluded,On another road"].split(",", 1)
            data.label.box2d.append(
                LabeledBox2D(
                    int(annotation["Upper left corner X"]),
                    int(annotation["Upper left corner Y"]),
                    int(annotation["Lower right corner X"]),
                    int(annotation["Lower right corner Y"]),
                    category=annotation["Annotation tag"],
                    attributes={
                        "Occluded": bool(int(occluded)),
                        "On another road": bool(int(on_another_road)),
                        "Origin file": annotation["Origin file"],
                        "Origin frame number": int(annotation["Origin frame number"]),
                        "Origin track": annotation["Origin track"],
                        "Origin track frame number": int(annotation["Origin track frame number"]),
                    },
                )
            )
    return segment
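# Illustrative sketch (an assumption inferred from the DictReader usage above):
# the ";"-delimited annotation header embeds a literal comma in one column
# name, e.g.:
#
#     Filename;Annotation tag;...;Occluded,On another road;...
#
# which is why the combined "Occluded,On another road" field is split on ","
# after parsing.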
def KenyanFoodOrNonfood(path: str) -> Dataset:
    """`Kenyan Food or Nonfood <https://github.com/monajalal/Kenyan-Food>`_ dataset.

    The file structure should be like::

        <path>
            images/
                food/
                    236171947206673742.jpg
                    ...
                nonfood/
                    168223407.jpg
                    ...
            data.csv
            split.py
            test.txt
            train.txt

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME_FOOD_OR_NONFOOD)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog_food_or_nonfood.json"))
    for segment_name, filename in SEGMENTS_FOOD_OR_NONFOOD.items():
        segment = dataset.create_segment(segment_name)
        with open(os.path.join(root_path, filename), encoding="utf-8") as fp:
            for line in fp:
                # Lines look like "images/food/<name>.jpg"; take the category
                # from the relative path before joining it with the root.
                relative_path = line.strip()
                data = Data(os.path.join(root_path, relative_path))
                category = relative_path.split("/")[1]
                data.label.classification = Classification(category)
                segment.append(data)
    return dataset
def _get_data(path: str, annotations: Any, flag: bool) -> Iterator[Tuple[Data, str]]:
    filepath_to_data: Dict[str, Data] = {}

    for annotation in annotations:
        filepath = annotation["filepath"][0]
        keypoints = LabeledKeypoints2D(
            annotation["coords"].T[_VALID_KEYPOINT_INDICES],
            attributes={"poselet_hit_idx": annotation["poselet_hit_idx"].T.tolist()},
        )
        box2d = LabeledBox2D(*annotation["torsobox"][0].tolist())
        if filepath not in filepath_to_data:
            data = Data(os.path.join(path, "images", filepath))
            data.label.keypoints2d = [keypoints]
            data.label.box2d = [box2d]
            attribute = {"currframe": int(annotation["currframe"][0][0])}
            if flag:
                attribute["isunchecked"] = bool(annotation["isunchecked"])
            data.label.classification = Classification(
                category=annotation["moviename"][0], attributes=attribute
            )
            filepath_to_data[filepath] = data

            if annotation["istrain"]:
                segment_name = "train"
            elif annotation["istest"]:
                segment_name = "test"
            else:
                segment_name = "bad"
            yield data, segment_name
        else:
            # The image was already yielded; just extend its labels.
            image_data = filepath_to_data[filepath]
            image_data.label.keypoints2d.append(keypoints)
            image_data.label.box2d.append(box2d)
def TLR(path: str) -> Dataset:
    """`TLR <http://www.lara.prd.fr/benchmarks/trafficlightsrecognition>`_ dataset.

    The file structure should be like::

        <path>
            Lara3D_UrbanSeq1_JPG/
                frame_011149.jpg
                frame_011150.jpg
                frame_<frame_index>.jpg
                ...
            Lara_UrbanSeq1_GroundTruth_cvml.xml

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    file_paths = glob(os.path.join(root_path, "Lara3D_UrbanSeq1_JPG", "*.jpg"))
    labels = _parse_xml(os.path.join(root_path, "Lara_UrbanSeq1_GroundTruth_cvml.xml"))
    for file_path in file_paths:
        # The image file name looks like "frame_000001.jpg";
        # the digits between the "frame_" prefix and ".jpg" are the frame index.
        frame_index = int(os.path.basename(file_path)[6:-4])
        data = Data(file_path)
        data.label.box2d = labels[frame_index]
        segment.append(data)
    return dataset
def get_voc_detection_data(
    stem: str, image_path: str, annotation_path: str, boolean_attributes: List[str]
) -> Data:
    """Get all information of the datum corresponding to VOC-like label files.

    Arguments:
        stem: The filename without extension of the data.
        image_path: The path of the image directory.
        annotation_path: The path of the annotation directory.
        boolean_attributes: The list of boolean attributes.

    Returns:
        Data: :class:`~tensorbay.dataset.data.Data` instance.

    """
    data = Data(os.path.join(image_path, f"{stem}.jpg"))
    box2d = []
    with open(os.path.join(annotation_path, f"{stem}.xml"), encoding="utf-8") as fp:
        labels: Any = xmltodict.parse(fp.read())
    objects = labels["annotation"]["object"]
    # xmltodict returns a single dict instead of a list when there is only one object.
    if not isinstance(objects, list):
        objects = [objects]
    for obj in objects:
        attributes = {attribute: bool(int(obj[attribute])) for attribute in boolean_attributes}
        attributes["pose"] = obj["pose"]
        bndbox = obj["bndbox"]
        box2d.append(
            LabeledBox2D(
                float(bndbox["xmin"]),
                float(bndbox["ymin"]),
                float(bndbox["xmax"]),
                float(bndbox["ymax"]),
                category=obj["name"],
                attributes=attributes,
            )
        )
    data.label.box2d = box2d
    return data
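# A minimal usage sketch for get_voc_detection_data (illustrative; the
# directory names and the boolean attribute list are placeholders, not from
# the original file):
def _example_get_voc_detection_data() -> None:
    voc_data = get_voc_detection_data(
        "000005",
        "VOCdevkit/VOC2012/JPEGImages",
        "VOCdevkit/VOC2012/Annotations",
        ["truncated", "difficult"],
    )
    for box in voc_data.label.box2d:
        print(box.category, box.xmin, box.ymin, box.xmax, box.ymax)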
def _load_segment_10k(dataset: Dataset, root_path: str, labels_dir: str) -> None:
    for segment_name in _SEGMENT_NAMES:
        segment = dataset.create_segment(segment_name)
        image_paths = glob(os.path.join(root_path, "images", "10k", segment_name, "*.jpg"))
        print(f"Reading data to segment '{segment_name}'...")
        if segment_name == "test":
            for image_path in image_paths:
                segment.append(Data(image_path))
        else:
            single_channel_mask_dirs: Dict[str, str] = {}
            original_mask_dirs: Dict[str, str] = {}
            for seg_type, dir_names in _SEGMENTATIONS_INFO.items():
                original_mask_dirs[seg_type] = os.path.join(labels_dir, *dir_names, segment_name)
                if seg_type != "sem":
                    single_channel_mask_dir = os.path.join(
                        labels_dir,
                        "single_channel_mask",
                        segment_name,
                        dir_names[0],
                    )
                    single_channel_mask_dirs[seg_type] = single_channel_mask_dir
                    os.makedirs(single_channel_mask_dir, exist_ok=True)

            label_contents = _read_label_file_10k(labels_dir, segment_name)
            for image_path in image_paths:
                segment.append(
                    _get_data_10k(
                        image_path,
                        original_mask_dirs,
                        label_contents[os.path.basename(image_path)],
                        single_channel_mask_dirs,
                    )
                )
        print(f"Finished reading data to segment '{segment_name}'")
def Elpv(path: str) -> Dataset:
    """`elpv <https://github.com/zae-bayern/elpv-dataset>`_ dataset.

    The file structure should be like::

        <path>
            labels.csv
            images/
                cell0001.png
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    csv_path = os.path.join(root_path, "labels.csv")
    with open(csv_path, encoding="utf-8") as csv_file:
        for row in csv_file:
            image_name, attributes, category = row.strip().split()
            dirname, basename = image_name.split("/")
            image_path = os.path.join(root_path, dirname, basename)
            data = Data(image_path)
            data.label.classification = Classification(
                attributes={"defect probability": float(attributes)}, category=category
            )
            segment.append(data)
    return dataset
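# Illustrative sketch (an assumption inferred from the whitespace split above):
# each labels.csv row carries three fields, e.g.
#
#     images/cell0001.png 0.3333 mono
#
# i.e. the relative image path, the defect probability, and the cell type.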
def test_upload_segment_with_file(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    dataset_client = gas_client.create_dataset(dataset_name)
    dataset_client.create_draft("draft-1")
    segment = Segment("segment1")
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        segment.append(data)

    dataset_client.upload_segment(segment)
    segment1 = Segment(name="segment1", client=dataset_client)
    assert len(segment1) == 10
    assert segment1[0].get_url()
    assert segment1[0].path == segment[0].target_remote_path

    gas_client.delete_dataset(dataset_name)
def _get_data(
    image_path: str,
    original_mask_dir: str,
    annotation_dir: str,
    new_mask_dir: str,
    category_ids: Dict[str, int],
) -> Data:
    stem = os.path.splitext(os.path.basename(image_path))[0]
    new_mask_path = os.path.join(new_mask_dir, f"{stem}.png")
    data = Data(image_path)
    label = data.label
    with open(os.path.join(annotation_dir, f"{stem}.xml"), encoding="utf-8") as fp:
        labels: Any = xmltodict.parse(fp.read())["image"]
    label.box2d, label.panoptic_mask = _get_box2d_and_panoptic_mask(
        labels["object"],
        os.path.join(original_mask_dir, f"{stem}_mask.png"),
        new_mask_path,
        category_ids,
    )
    label.classification = _get_classification(labels["JSON_Variation_Parameters"]["parameter"])
    return data
def test_copy_data_from_commits(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)
    dataset_client = gas_client.upload_dataset(dataset)
    dataset_client.commit("commit_1")

    for i in range(10, 20):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment.append(data)
    dataset_client = gas_client.upload_dataset(dataset)
    dataset_client.commit("commit_2")

    # Check out the earliest commit (the last element of list_commits()),
    # then copy data from it into a new draft.
    dataset_client_1 = gas_client.get_dataset(dataset_name)
    commit_id = dataset_client_1.list_commits()[-1].commit_id
    dataset_client_1.checkout(revision=commit_id)
    dataset_client.create_draft("draft_3")
    segment_client_1 = dataset_client_1.get_segment("Segment1")
    segment_client_2 = dataset_client.get_segment("Segment1")
    segment_client_2.copy_data("hello0.txt", "goodbye0.txt", source_client=segment_client_1)

    segment2 = Segment("Segment1", client=dataset_client)
    assert segment2[0].path == "goodbye0.txt"
    assert segment2[0].path != segment[0].target_remote_path
    assert segment2[0].label
    assert len(segment2) == 21

    gas_client.delete_dataset(dataset_name)
def test_upload_dataset_only_with_file(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT")
        segment.append(Data(local_path=str(local_path)))

    dataset_client = gas_client.upload_dataset(dataset)
    assert not dataset_client.get_catalog()
    segment1 = Segment("Segment1", client=dataset_client)
    assert len(segment1) == 10
    assert segment1[0].path == "hello0.txt"
    assert not segment1[0].label

    gas_client.delete_dataset(dataset_name)
def HardHatWorkers(path: str) -> Dataset:
    """`Hard Hat Workers <https://makeml.app/datasets/hard-hat-workers>`_ dataset.

    The file structure should be like::

        <path>
            annotations/
                hard_hat_workers0.xml
                ...
            images/
                hard_hat_workers0.png
                ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    annotation_dir = os.path.join(root_path, "annotations")
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    image_paths = glob(os.path.join(root_path, "images", "*.png"))
    for image_path in image_paths:
        data = Data(image_path)
        data.label.box2d = _load_labels(
            os.path.join(
                annotation_dir, f"{os.path.splitext(os.path.basename(image_path))[0]}.xml"
            )
        )
        segment.append(data)
    return dataset
def _generate_data(image_path: str, labels: Dict[str, Any]) -> Data:
    data = Data(image_path)
    data.label.box2d = []
    image_id = labels["image_name_id_map"][os.path.basename(image_path)]
    image_annotations_map = labels["image_annotations_map"]
    if image_id not in image_annotations_map:
        return data

    annotations = labels["annotations"]
    poses = labels["poses"]
    categories = labels["categories"]
    for annotation_id in image_annotations_map[image_id]:
        annotation = annotations[annotation_id]
        x_top, y_top, width, height = annotation["bbox"]
        attributes = {
            "occluded": annotation["occluded"],
            "difficult": annotation["difficult"],
            "pose": poses[annotation["pose_id"] - 1]["name"],  # pose ids are 1-based
            "truncated": annotation["truncated"],
        }
        data.label.box2d.append(
            LabeledBox2D.from_xywh(
                x=x_top,
                y=y_top,
                width=width,
                height=height,
                category=categories[annotation["category_id"]]["name"],
                attributes=attributes,
                instance=str(annotation["tracking_id"]),
            )
        )
    return data
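# Illustrative sketch (an assumption, not from the original file): the shape of
# the "labels" dict consumed by _generate_data above. The keys mirror its
# dictionary accesses; the concrete values are made up.
_EXAMPLE_LABELS = {
    "image_name_id_map": {"000001.jpg": 0},
    "image_annotations_map": {0: [0]},
    "annotations": [
        {
            "bbox": [10.0, 20.0, 30.0, 40.0],  # x, y, width, height
            "occluded": False,
            "difficult": False,
            "truncated": False,
            "pose_id": 1,  # 1-based index into "poses"
            "category_id": 0,
            "tracking_id": 7,
        }
    ],
    "poses": [{"name": "frontal"}],
    "categories": [{"name": "person"}],
}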
def THCHS30(path: str) -> Dataset:
    """`THCHS-30 <http://166.111.134.19:7777/data/thchs30/README.html>`_ dataset.

    The file structure should be like::

        <path>
            lm_word/
                lexicon.txt
            data/
                A11_0.wav.trn
                ...
            dev/
                A11_101.wav
                ...
            train/
            test/

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    dataset = Dataset(DATASET_NAME)
    dataset.catalog.sentence = _get_subcatalog(os.path.join(path, "lm_word", "lexicon.txt"))
    for segment_name in _SEGMENT_NAME_LIST:
        segment = dataset.create_segment(segment_name)
        for filename in glob(os.path.join(path, segment_name, "*.wav")):
            data = Data(filename)
            label_file = os.path.join(path, "data", os.path.basename(filename) + ".trn")
            data.label.sentence = _get_label(label_file)
            segment.append(data)
    return dataset
def PASCALContext(mask_path: str, image_path: str) -> Dataset:
    """`PASCALContext <https://cs.stanford.edu/~roozbeh/pascal-context/>`_ dataset.

    The file structure should be like::

        <mask_path>
            <image_name>.png
            ...
        <image_path>
            <image_name>.jpg
            ...

    Arguments:
        mask_path: The root directory of the dataset mask.
        image_path: The root directory of the dataset image.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_mask_path = os.path.abspath(os.path.expanduser(mask_path))
    root_image_path = os.path.abspath(os.path.expanduser(image_path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment("trainval")
    for mask_filename in glob(os.path.join(root_mask_path, "*.png")):
        stem = os.path.splitext(os.path.basename(mask_filename))[0]
        data = Data(os.path.join(root_image_path, f"{stem}.jpg"))
        data.label.semantic_mask = SemanticMask(mask_filename)
        segment.append(data)
    return dataset
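# A minimal usage sketch for PASCALContext (illustrative; both paths are
# placeholders, not from the original file):
def _example_load_pascal_context() -> None:
    dataset = PASCALContext("~/data/pascal_context_masks", "~/data/voc2010_images")
    trainval = dataset[0]
    print(len(trainval), trainval[0].label.semantic_mask.path)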
def THUCNews(path: str) -> Dataset:
    """`THUCNews <http://thuctc.thunlp.org/>`_ dataset.

    The folder structure should be like::

        <path>
            <category>/
                0.txt
                1.txt
                2.txt
                3.txt
                ...
            <category>/
            ...

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))
    segment = dataset.create_segment()
    # The folder names come from the catalog, so only known categories are read.
    for category in dataset.catalog.classification.categories.keys():
        text_paths = glob(os.path.join(root_path, category, "*.txt"))
        for text_path in text_paths:
            data = Data(text_path)
            data.label.classification = Classification(category)
            segment.append(data)
    return dataset
def test__upload_segment(self, mocker):
    segment_test = Segment(name="test1")
    for i in range(5):
        segment_test.append(Data(f"data{i}.png"))
    segment_client = SegmentClient(name="test1", data_client=self.dataset_client)
    get_or_create_segment = mocker.patch(
        f"{dataset.__name__}.DatasetClient.get_or_create_segment", return_value=segment_client
    )
    list_data_paths = mocker.patch(
        f"{segment.__name__}.SegmentClient.list_data_paths",
        return_value=["data1.png", "data2.png"],
    )
    multithread_upload = mocker.patch(f"{dataset.__name__}.multithread_upload")

    with Tqdm(5, disable=False) as pbar:
        self.dataset_client._upload_segment(segment_test, skip_uploaded_files=True, pbar=pbar)
        get_or_create_segment.assert_called_once_with(segment_test.name)
        list_data_paths.assert_called_once_with()
        args, keywords = multithread_upload.call_args
        assert args[0] == segment_client._upload_or_import_data
        # "data1.png" and "data2.png" already exist remotely, so they are skipped.
        assert [item.path for item in args[1]] == ["data0.png", "data3.png", "data4.png"]
        assert keywords["callback"] == segment_client._synchronize_upload_info
        assert keywords["jobs"] == 1
        assert keywords["pbar"] == pbar
        multithread_upload.assert_called_once()

    with Tqdm(5, disable=False) as pbar:
        self.dataset_client._upload_segment(segment_test, skip_uploaded_files=False, pbar=pbar)
        get_or_create_segment.assert_called_with(segment_test.name)
        list_data_paths.assert_called_with()
        args, keywords = multithread_upload.call_args
        assert args[0] == segment_client._upload_or_import_data
        assert [item.path for item in args[1]] == [f"data{i}.png" for i in range(5)]
        assert keywords["callback"] == segment_client._synchronize_upload_info
        assert keywords["jobs"] == 1
        assert keywords["pbar"] == pbar
        multithread_upload.assert_called()
def init_dataset_client(accesskey, url, tmp_path_factory):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    dataset_client = gas_client.create_dataset(dataset_name)
    dataset_client.create_draft("draft-1")
    dataset_client.commit("commit-1")
    dataset_client.create_branch("dev")

    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)
    path = tmp_path_factory.mktemp("sub")
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text(f"CONTENT_{i}")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL_2)
        segment.append(data)
    dataset_client = gas_client.upload_dataset(dataset, branch_name="dev")
    dataset_client.commit("commit-2")
    dataset_client.checkout(DEFAULT_BRANCH)

    dataset = Dataset(name=dataset_name)
    segment = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)
    path = tmp_path_factory.mktemp("sub")
    for i in range(4):
        local_path = path / f"hello{i}.txt"
        local_path.write_text(f"CONTENT_{i}")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL_1)
        segment.append(data)
    dataset_client = gas_client.upload_dataset(dataset, branch_name=DEFAULT_BRANCH)
    dataset_client.commit("commit-3")

    yield dataset_client
    gas_client.delete_dataset(dataset_name)
def test_move_segment_override(self, accesskey, url, tmp_path):
    gas_client = GAS(access_key=accesskey, url=url)
    dataset_name = get_dataset_name()
    gas_client.create_dataset(dataset_name)
    dataset = Dataset(name=dataset_name)
    segment1 = dataset.create_segment("Segment1")
    dataset._catalog = Catalog.loads(CATALOG)
    path = tmp_path / "sub"
    path.mkdir()
    for i in range(10):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT_1")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment1.append(data)

    segment2 = dataset.create_segment("Segment2")
    for i in range(10, 20):
        local_path = path / f"hello{i}.txt"
        local_path.write_text("CONTENT_2")
        data = Data(local_path=str(local_path))
        data.label = Label.loads(LABEL)
        segment2.append(data)
    dataset_client = gas_client.upload_dataset(dataset)

    dataset_client.move_segment("Segment1", "Segment2", strategy="override")
    with pytest.raises(ResourceNotExistError):
        dataset_client.get_segment("Segment1")

    segment_moved = Segment("Segment2", client=dataset_client)
    assert segment_moved[0].path == "hello0.txt"
    assert segment_moved[0].path == segment1[0].target_remote_path
    assert segment_moved[0].open().read() == b"CONTENT_1"
    assert segment_moved[0].label

    gas_client.delete_dataset(dataset_name)