def dataset_convert(dataset_identifier: str, format: str, output_dir: Optional[PathLike] = None) -> None:
    """
    Converts the annotations from the given dataset to the given format.
    Exits the application if no dataset with the given slug exists or no releases for the
    dataset were previously pulled.

    Parameters
    ----------
    dataset_identifier: str
        The dataset identifier, normally in the "<team-slug>/<dataset-slug>:<version>" form.
    format: str
        The format we want to convert to.
    output_dir: Optional[PathLike]
        The folder where the exported annotation files will be placed. If None, they are
        written inside the annotations folder of the dataset under 'other_formats/{format}'.
        Defaults to None.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_identifier)
    client: Client = _load_client(team_slug=identifier.team_slug)

    try:
        parser: ExportParser = get_exporter(format)
        dataset: RemoteDataset = client.get_remote_dataset(dataset_identifier=identifier)

        if not dataset.local_path.exists():
            _error(
                f"No annotations downloaded for dataset {dataset}, first pull a release using "
                f"'darwin dataset pull {identifier}'"
            )

        release_path: Path = get_release_path(dataset.local_path, identifier.version)
        annotations_path: Path = release_path / "annotations"
        if output_dir is None:
            output_dir = release_path / "other_formats" / format
        else:
            output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        export_annotations(parser, [annotations_path], output_dir)
    except ExporterNotFoundError:
        _error(f"Unsupported export format: {format}, currently supported: {export_formats}")
    except AttributeError:
        _error(f"Unsupported export format: {format}, currently supported: {export_formats}")
    except NotFound as e:
        _error(f"No dataset with name '{e.name}'")
def dataset_convert(dataset_slug: str, format: str, output_dir: Optional[Union[str, Path]]):
    client = _load_client()
    parser = find_supported_format(format, darwin.exporter.formats.supported_formats)

    try:
        dataset = client.get_remote_dataset(dataset_identifier=dataset_slug)
        if not dataset.local_path.exists():
            _error(
                f"No annotations downloaded for dataset {dataset}, first pull a release using "
                f"'darwin dataset pull {dataset_slug}'"
            )

        release_path = get_release_path(dataset.local_path)
        annotations_path = release_path / "annotations"
        if output_dir is None:
            output_dir = release_path / "other_formats" / format
        else:
            output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        exporter.export_annotations(parser, [annotations_path], output_dir)
    except NotFound as e:
        _error(f"No dataset with name '{e.name}'")
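# --- Usage sketch (not part of the library code above) ---
# A minimal, hypothetical call of dataset_convert; the team/dataset slug and the "coco"
# format are placeholders, and a release must already have been pulled with
# `darwin dataset pull`. Passing None as output_dir writes the converted files under the
# release's 'other_formats/coco' folder; a path writes them there instead.
dataset_convert("my-team/my-dataset", "coco", None)
dataset_convert("my-team/my-dataset", "coco", "/tmp/my-dataset-coco")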
def __init__(
    self,
    dataset_path: Path,
    annotation_type: str,
    partition: Optional[str] = None,
    split: str = "default",
    split_type: str = "random",
    release_name: Optional[str] = None,
):
    """
    Creates a dataset

    Parameters
    ----------
    dataset_path: Path, str
        Path to the location of the dataset on the file system
    annotation_type: str
        The type of annotation classes [tag, bounding_box, polygon]
    partition: str
        Selects one of the partitions [train, val, test]
    split: str
        Selects the split that defines the percentages used (use 'default' to select the default split)
    split_type: str
        Heuristic used to do the split [random, stratified]
    release_name: str
        Version of the dataset
    """
    assert dataset_path is not None
    release_path = get_release_path(dataset_path, release_name)
    annotations_dir = release_path / "annotations"
    assert annotations_dir.exists()
    images_dir = dataset_path / "images"
    assert images_dir.exists()

    if partition not in ["train", "val", "test", None]:
        raise ValueError("partition should be either 'train', 'val', or 'test'")
    if split_type not in ["random", "stratified"]:
        raise ValueError("split_type should be either 'random' or 'stratified'")
    if annotation_type not in ["tag", "polygon", "bounding_box"]:
        raise ValueError("annotation_type should be either 'tag', 'bounding_box', or 'polygon'")

    self.dataset_path = dataset_path
    self.annotation_type = annotation_type
    self.images_path: List[Path] = []
    self.annotations_path: List[Path] = []
    self.original_classes = None
    self.original_images_path: Optional[List[Path]] = None
    self.original_annotations_path: Optional[List[Path]] = None

    # Get the list of classes
    self.classes = get_classes(
        self.dataset_path, release_name, annotation_type=self.annotation_type, remove_background=True
    )
    self.num_classes = len(self.classes)

    # Get the list of stems
    if partition:
        # Get the split
        if split_type == "random":
            split_file = f"{split_type}_{partition}.txt"
        elif split_type == "stratified":
            split_file = f"{split_type}_{annotation_type}_{partition}.txt"
        split_path = release_path / "lists" / split / split_file
        if split_path.is_file():
            stems = (e.strip() for e in split_path.open())
        else:
            raise FileNotFoundError(
                "Could not find a dataset partition. "
                "Split the dataset using `split_dataset()` from `darwin.dataset.utils`"
            ) from None
    else:
        # If the partition is not specified, get all the annotations
        stems = [e.stem for e in annotations_dir.glob("*.json")]

    # Find all the annotations and their corresponding images
    for stem in stems:
        annotation_path = annotations_dir / f"{stem}.json"
        images = []
        for ext in SUPPORTED_IMAGE_EXTENSIONS:
            image_path = images_dir / f"{stem}{ext}"
            if image_path.exists():
                images.append(image_path)
        if len(images) < 1:
            raise ValueError(f"Annotation ({annotation_path}) does not have a corresponding image")
        if len(images) > 1:
            raise ValueError(f"Image ({stem}) is present with multiple extensions. This is forbidden.")
        assert len(images) == 1
        self.images_path.append(images[0])
        self.annotations_path.append(annotation_path)

    if len(self.images_path) == 0:
        raise ValueError(f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file in {images_dir}")

    assert len(self.images_path) == len(self.annotations_path)
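# --- Usage sketch (not part of the library code above) ---
# A minimal, hypothetical construction using the __init__ shown above. The class name
# `LocalDataset` is an assumption (only the constructor is shown here), and the dataset
# path is a placeholder. This loads the stratified train partition of the default split
# with polygon annotations.
from pathlib import Path

dataset = LocalDataset(
    dataset_path=Path("/datasets/my-team/my-dataset"),
    annotation_type="polygon",
    partition="train",
    split_type="stratified",
)
print(dataset.num_classes, len(dataset.images_path))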
def it_uses_provided_version_name_otherwise(team_dataset_path: Path):
    test_release_path = team_dataset_path / "releases" / "test"
    test_release_path.mkdir(parents=True)

    assert get_release_path(team_dataset_path, "test") == test_release_path
def it_defaults_to_latest_version_if_no_version_provided(team_dataset_path: Path):
    latest_release_path = team_dataset_path / "releases" / "latest"
    latest_release_path.mkdir(parents=True)

    assert get_release_path(team_dataset_path) == latest_release_path
def split_dataset(
    dataset_path: PathLike,
    release_name: Optional[str] = None,
    val_percentage: float = 0.1,
    test_percentage: float = 0.2,
    split_seed: int = 0,
    make_default_split: bool = True,
    stratified_types: List[str] = ["bounding_box", "polygon", "tag"],
) -> Path:
    """
    Given a local dataset (pulled from Darwin), split it by creating lists of filenames.
    The partitions to split the dataset into are called train, val and test.

    The dataset is always split randomly, and can be additionally split according to the
    stratified strategy by providing a list of stratified types.

    Parameters
    ----------
    dataset_path : Path
        Local path to the dataset
    release_name : str
        Version of the dataset
    val_percentage : float
        Percentage of images used in the validation set
    test_percentage : float
        Percentage of images used in the test set
    split_seed : int
        Fix seed for random split creation
    make_default_split : bool
        Makes this split the default split
    stratified_types : List[str]
        List of annotation types to split with the stratified strategy

    Returns
    -------
    split_path : Path
        Path to the folder containing the partition lists for each strategy (random, stratified, ...)
    """
    # Requirements: scikit-learn
    try:
        import sklearn  # noqa
    except ImportError:
        raise ImportError(
            "Darwin requires scikit-learn to split a dataset. Install it using: pip install scikit-learn"
        ) from None

    _validate_split(val_percentage, test_percentage)

    # Infer release path
    if isinstance(dataset_path, str):
        dataset_path = Path(dataset_path)
    release_path = get_release_path(dataset_path, release_name)

    # List all annotation files in release
    annotation_path = release_path / "annotations"
    assert annotation_path.exists()
    annotation_files = list(annotation_path.glob("**/*.json"))

    # Prepare the "lists" folder, which is where we are going to save the split files
    lists_path = release_path / "lists"
    lists_path.mkdir(parents=True, exist_ok=True)

    # Compute sizes of each dataset partition
    dataset_size: int = len(annotation_files)
    val_size: int = int(val_percentage * dataset_size)
    test_size: int = int(test_percentage * dataset_size)
    train_size: int = dataset_size - val_size - test_size

    # Compute the split id, a combination of the partition sizes and the split seed.
    # The split id is used to create a folder with the same name in the "lists" folder.
    split_id = f"{train_size}_{val_size}_{test_size}"
    if split_seed != 0:
        split_id += f"_s{split_seed}"
    split_path = lists_path / split_id

    # Build a split paths dictionary. The split paths are indexed by strategy (e.g. random
    # or stratified), and by partition (train/val/test)
    split = _build_split(split_path, stratified_types)
    assert split.is_valid()

    # Do the actual splitting
    split_path.mkdir(exist_ok=True)

    if split.random:
        _random_split(
            annotation_path=annotation_path,
            annotation_files=annotation_files,
            split=split.random,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            split_seed=split_seed,
        )

    if split.stratified:
        _stratified_split(
            annotation_path=annotation_path,
            split=split.stratified,
            annotation_files=annotation_files,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            stratified_types=stratified_types,
            split_seed=split_seed,
        )

    # Create symlink for default split
    default_split_path = lists_path / "default"
    if make_default_split or not default_split_path.exists():
        if default_split_path.exists():
            default_split_path.unlink()
        default_split_path.symlink_to(f"./{split_id}")

    return split_path
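# --- Usage sketch (not part of the library code above) ---
# A minimal, hypothetical call of split_dataset; the dataset path is a placeholder and a
# release must already exist locally. The returned Path points at the folder under
# 'lists/' holding the train/val/test file lists; 'lists/default' is symlinked to it
# because make_default_split is left at its default of True.
split_folder = split_dataset(
    "/datasets/my-team/my-dataset",
    val_percentage=0.1,
    test_percentage=0.2,
)
print(split_folder)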