def _load_client(
    team_slug: Optional[str] = None,
    offline: bool = False,
    maybe_guest: bool = False,
    dataset_identifier: Optional[str] = None,
):
    """Fetches a client, potentially offline.

    Parameters
    ----------
    team_slug : Optional[str]
        Slug of the team the client should default to.
    offline : bool
        Flag for using an offline client.
    maybe_guest : bool
        Flag to make a guest client, if config is missing.
    dataset_identifier : Optional[str]
        Identifier from which to infer the team slug, if not given explicitly.

    Returns
    -------
    Client
        The client requested.
    """
    if not team_slug and dataset_identifier:
        team_slug = DatasetIdentifier.parse(dataset_identifier).team_slug
    try:
        config_dir = Path.home() / ".darwin" / "config.yaml"
        client = Client.from_config(config_dir, team_slug=team_slug)
        return client
    except MissingConfig:
        if maybe_guest:
            return Client.from_guest()
        else:
            _error("Authenticate first")
    except InvalidLogin:
        _error("Please re-authenticate")
    except Unauthenticated:
        _error("Please re-authenticate")

def pull_dataset(dataset_slug: str):
    """Downloads a remote dataset (images and annotations) into the datasets directory.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset to pull.
    """
    version = DatasetIdentifier.parse(dataset_slug).version or "latest"
    client = _load_client(offline=False)
    try:
        dataset = client.get_remote_dataset(dataset_identifier=dataset_slug)
    except NotFound:
        _error(f"Dataset '{dataset_slug}' does not exist at {client.url}. "
               f"Use 'darwin remote' to list all the remote datasets.")
    except Unauthenticated:
        _error("Please re-authenticate.")
    try:
        release = dataset.get_release(version)
        dataset.pull(release=release)
    except NotFound:
        _error(f"Version '{dataset.identifier}:{version}' does not exist. "
               f"Use 'darwin dataset releases' to list all available versions.")
    print(f"Dataset {release.identifier} downloaded at {dataset.local_path}.")

def get_dataset(
    dataset_slug: str,
    dataset_type: str,
    partition: Optional[str] = None,
    split: str = "default",
    split_type: str = "random",
    transform: Optional[List] = None,
):
    """
    Creates and returns a dataset.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset to retrieve.
    dataset_type: str
        The type of dataset [classification, instance-segmentation, semantic-segmentation].
    partition: str
        Selects one of the partitions [train, val, test, None]. (Default: None)
    split: str
        Selects the split that defines the percentages used. (Default: 'default')
    split_type: str
        Heuristic used to do the split [random, stratified]. (Default: 'random')
    transform : list[torchvision.transforms]
        List of PyTorch transforms.
    """
    dataset_functions = {
        "classification": ClassificationDataset,
        "instance-segmentation": InstanceSegmentationDataset,
        "semantic-segmentation": SemanticSegmentationDataset,
    }
    dataset_function = dataset_functions.get(dataset_type)
    if not dataset_function:
        list_of_types = ", ".join(dataset_functions.keys())
        _error(f"dataset_type needs to be one of '{list_of_types}'")

    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            return dataset_function(
                dataset_path=p,
                partition=partition,
                split=split,
                split_type=split_type,
                release_name=identifier.version,
                transform=transform,
            )

    for p in client.list_deprecated_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            _error(
                f"Found a local version of the dataset {identifier.dataset_slug} which uses a deprecated format. "
                f"Run `darwin dataset migrate {identifier}` if you want to be able to use it in darwin-py."
            )

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")

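# A minimal usage sketch for get_dataset. The "my-team/my-dataset" slug is a
# hypothetical placeholder, and the dataset is assumed to have been pulled
# locally already (e.g. with `darwin dataset pull`). The returned object is a
# PyTorch-style dataset, so it can be indexed directly or wrapped in a DataLoader.
train_dataset = get_dataset(
    "my-team/my-dataset",
    dataset_type="instance-segmentation",
    partition="train",
    split_type="stratified",
)
print(f"Loaded {len(train_dataset)} training samples")
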
def path(dataset_slug: str) -> Path:
    """
    Returns the absolute path of the specified dataset.
    Exits the application if the dataset does not exist locally.

    Parameters
    ----------
    dataset_slug: str
        The dataset's slug.

    Returns
    -------
    Path
        The absolute path of the dataset.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    client: Client = _load_client(offline=True)

    for path in client.list_local_datasets(team_slug=identifier.team_slug):
        if identifier.dataset_slug == path.name:
            return path

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")

def export_dataset(dataset_slug: str,
                   include_url_token: bool,
                   name: str,
                   annotation_class_ids: Optional[List[str]] = None) -> None:
    """
    Create a new release for the dataset.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset to export.
    include_url_token: bool
        If True includes the url token, if False does not.
    name: str
        Name of the release.
    annotation_class_ids: Optional[List[str]]
        List of the classes to filter. Defaults to None.
    """
    client: Client = _load_client(offline=False)
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    ds: RemoteDataset = client.get_remote_dataset(identifier)
    ds.export(annotation_class_ids=annotation_class_ids, name=name, include_url_token=include_url_token)
    identifier.version = name
    print(f"Dataset {dataset_slug} successfully exported to {identifier}")
    print_new_version_info(client)

def get_remote_dataset(
        self, dataset_identifier: Union[str, DatasetIdentifier]) -> RemoteDataset:
    """Get a remote dataset based on the identifier passed.

    Parameters
    ----------
    dataset_identifier : Union[str, DatasetIdentifier]
        Identifier of the dataset to return, e.g. "team-slug/dataset-slug".

    Returns
    -------
    RemoteDataset
        Initialized dataset
    """
    if isinstance(dataset_identifier, str):
        dataset_identifier = DatasetIdentifier.parse(dataset_identifier)
    if not dataset_identifier.team_slug:
        dataset_identifier.team_slug = self.default_team

    matching_datasets = [
        dataset for dataset in self.list_remote_datasets(team=dataset_identifier.team_slug)
        if dataset.slug == dataset_identifier.dataset_slug
    ]
    if not matching_datasets:
        raise NotFound(dataset_identifier)
    return matching_datasets[0]

def get_dataset_files(dataset_slug: str) -> Iterator[DatasetItem]:
    """ Returns the files in a dataset, along with their status """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)
    resp = dataset.fetch_remote_files()
    return resp

def add_labels_to_dataset(dataset_slug: str, labels: List[str], label_type: str):
    """ Add labels to a dataset """
    assert label_type in ['polygon', 'tag']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    for label in labels:
        dataset.create_annotation_class(label, label_type)

def populate_dataset_annotations(dataset_slug: str, format_name: str, file_paths: List[str]):
    """ Import annotations from local files into a dataset """
    assert format_name in ['darwin', 'coco', 'pascal_voc']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    format_dict = {k: v for (k, v) in darwin.importer.formats.supported_formats}
    parser = format_dict[format_name]
    importer.import_annotations(dataset, parser, file_paths)

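# A hedged usage sketch for the two helpers above; the slug, label names and
# annotation file path are placeholders. It creates the annotation classes
# first, then imports COCO-format annotations into the same dataset.
add_labels_to_dataset("my-team/my-dataset", labels=["car", "person"], label_type="polygon")
populate_dataset_annotations(
    "my-team/my-dataset",
    format_name="coco",
    file_paths=["annotations/instances_train.json"],
)
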
def migrate_dataset(dataset_slug: str):
    """Migrates an outdated local dataset to the latest format.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset to migrate.
    """
    identifier = DatasetIdentifier.parse(dataset_slug)
    if not identifier.team_slug:
        _error(
            "Team name missing.\nUsage: darwin dataset migrate <team-name>/<dataset-name>"
        )

    client = _load_client(offline=True)
    authenticated_teams = [e["slug"] for e in client.config.get_all_teams()]
    if identifier.team_slug not in authenticated_teams:
        _error(
            f"Could not find '{identifier.team_slug}' in the authenticated teams. "
            "Run 'darwin authenticate' to authenticate it.")

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            print(f"Dataset '{dataset_slug}' already migrated.")
            return

    old_path = None
    for p in client.list_deprecated_local_datasets(identifier.team_slug):
        if identifier.dataset_slug == p.name:
            old_path = p
    if not old_path:
        _error(
            f"Could not find a deprecated local version of the dataset '{dataset_slug}'. "
            f"Use 'darwin dataset pull {dataset_slug}' to pull the latest version from darwin."
        )

    # Move the dataset under the team_slug folder
    team_config = client.config.get_team(identifier.team_slug)
    team_path = Path(team_config["datasets_dir"]) / identifier.team_slug
    team_path.mkdir(exist_ok=True)
    shutil.move(str(old_path), str(team_path))

    # Update internal structure
    dataset_path = team_path / old_path.name
    release_path = dataset_path / "releases/migrated"
    for p in ["annotations", "lists"]:
        if (dataset_path / p).exists():
            shutil.move(str(dataset_path / p), str(release_path / p))

    latest_release = dataset_path / "releases/latest"
    if latest_release.exists():
        latest_release.unlink()
    latest_release.symlink_to("./migrated")

    print(f"Dataset {identifier.dataset_slug} migrated to {dataset_path}.")

def get_remote_dataset(
        self, dataset_identifier: Union[str, DatasetIdentifier]) -> RemoteDataset:
    """Get a remote dataset based on the identifier passed.

    Parameters
    ----------
    dataset_identifier : Union[str, DatasetIdentifier]
        Identifier of the dataset to return, e.g. "team-slug/dataset-slug".

    Returns
    -------
    RemoteDataset
        Initialized dataset
    """
    if isinstance(dataset_identifier, str):
        dataset_identifier = DatasetIdentifier.parse(dataset_identifier)
    if not dataset_identifier.team_slug:
        dataset_identifier.team_slug = self.default_team

    try:
        matching_datasets = [
            dataset for dataset in self.list_remote_datasets(team=dataset_identifier.team_slug)
            if dataset.slug == dataset_identifier.dataset_slug
        ]
    except Unauthorized:
        # There is a chance that we tried to access an open dataset
        dataset = self.get(
            f"{dataset_identifier.team_slug}/{dataset_identifier.dataset_slug}")

        # If there isn't a record of this team, create one.
        if not self.config.get_team(dataset_identifier.team_slug, raise_on_invalid_team=False):
            datasets_dir = Path.home() / ".darwin" / "datasets"
            self.config.set_team(team=dataset_identifier.team_slug,
                                 api_key="",
                                 datasets_dir=str(datasets_dir))

        return RemoteDataset(
            name=dataset["name"],
            slug=dataset["slug"],
            team=dataset_identifier.team_slug,
            dataset_id=dataset["id"],
            image_count=dataset["num_images"],
            progress=0,
            client=self,
        )

    if not matching_datasets:
        raise NotFound(dataset_identifier)
    return matching_datasets[0]

def create_dataset(dataset_slug: str):
    """Creates a dataset remotely"""
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(team_slug=identifier.team_slug)
    try:
        dataset = client.create_dataset(name=identifier.dataset_slug)
        print(
            f"Dataset '{dataset.name}' ({dataset.team}/{dataset.slug}) has been created.\nAccess at {dataset.remote_path}"
        )
    except NameTaken:
        _error(f"Dataset name '{identifier.dataset_slug}' is already taken.")
    except ValidationError:
        _error(f"Dataset name '{identifier.dataset_slug}' is not valid.")

def path(dataset_slug: str) -> Path:
    """Returns the absolute path of the specified dataset, if synced"""
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)
    try:
        for p in client.list_local_datasets(team=identifier.team_slug):
            if identifier.dataset_slug == p.name:
                return p
    except NotFound as e:
        _error(
            f"Dataset '{e.name}' does not exist locally. "
            f"Use 'darwin dataset remote' to see all the available datasets, "
            f"and 'darwin dataset pull' to pull them.")

def create_dataset(dataset_slug: str):
    """ Create new empty dataset """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.create_dataset(name=identifier.dataset_slug)
    dataset_info = dict(
        name=dataset.name,
        id=dataset.dataset_id,
        slug=dataset.slug,
        remote_path=dataset.remote_path
    )
    return dataset_info

def pull_dataset(dataset_slug: str,
                 only_annotations: bool = False,
                 folders: bool = False,
                 video_frames: bool = False) -> None:
    """
    Downloads a remote dataset (images and annotations) into the datasets directory.
    Exits the application if the dataset is not found, the user is not authenticated, there are
    no releases, or the export format of the latest release is not supported.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation.
    only_annotations: bool
        Download only the annotations and no corresponding images. Defaults to False.
    folders: bool
        Recreates the folders in the dataset. Defaults to False.
    video_frames: bool
        Pulls video frame images instead of video files. Defaults to False.
    """
    version: str = DatasetIdentifier.parse(dataset_slug).version or "latest"
    client: Client = _load_client(offline=False, maybe_guest=True)
    try:
        dataset: RemoteDataset = client.get_remote_dataset(dataset_identifier=dataset_slug)
    except NotFound:
        _error(
            f"Dataset '{dataset_slug}' does not exist, please check the spelling. "
            f"Use 'darwin remote' to list all the remote datasets.")
    except Unauthenticated:
        _error("Please re-authenticate.")

    try:
        release: Release = dataset.get_release(version)
        dataset.pull(release=release,
                     only_annotations=only_annotations,
                     use_folders=folders,
                     video_frames=video_frames)
        print_new_version_info(client)
    except NotFound:
        _error(
            f"Version '{dataset.identifier}:{version}' does not exist. "
            f"Use 'darwin dataset releases' to list all available versions.")
    except UnsupportedExportFormat as uef:
        _error(
            f"Version '{dataset.identifier}:{version}' is of format '{uef.format}', "
            f"only the darwin format ('json') is supported for `darwin dataset pull`"
        )

    print(f"Dataset {release.identifier} downloaded at {dataset.local_path}.")

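# Hypothetical call sites for pull_dataset (the slug is a placeholder); these
# mirror `darwin dataset pull my-team/my-dataset:v1.0 --folders` on the CLI.
pull_dataset("my-team/my-dataset")                      # latest release, flat layout
pull_dataset("my-team/my-dataset:v1.0", folders=True)   # specific release, keep remote folders
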
def dataset_convert(dataset_identifier: str,
                    format: str,
                    output_dir: Optional[PathLike] = None) -> None:
    """
    Converts the annotations from the given dataset to the given format.
    Exits the application if no dataset with the given slug exists or no releases for the
    dataset were previously pulled.

    Parameters
    ----------
    dataset_identifier: str
        The dataset identifier, normally in the "<team-slug>/<dataset-slug>:<version>" form.
    format: str
        The format we want to convert to.
    output_dir: Optional[PathLike]
        The folder where the exported annotation files will be. If None, it will be inside the
        annotations folder of the dataset, under 'other_formats/{format}'. Defaults to None.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_identifier)
    client: Client = _load_client(team_slug=identifier.team_slug)

    try:
        parser: ExportParser = get_exporter(format)
        dataset: RemoteDataset = client.get_remote_dataset(dataset_identifier=identifier)
        if not dataset.local_path.exists():
            _error(
                f"No annotations downloaded for dataset {dataset}, first pull a release using "
                f"'darwin dataset pull {identifier}'")

        release_path: Path = get_release_path(dataset.local_path, identifier.version)
        annotations_path: Path = release_path / "annotations"
        if output_dir is None:
            output_dir = release_path / "other_formats" / format
        else:
            output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        export_annotations(parser, [annotations_path], output_dir)
    except ExporterNotFoundError:
        _error(
            f"Unsupported export format: {format}, currently supported: {export_formats}"
        )
    except AttributeError:
        _error(
            f"Unsupported export format: {format}, currently supported: {export_formats}"
        )
    except NotFound as e:
        _error(f"No dataset with name '{e.name}'")

def run_demo(
    *,
    team_slug: Optional[str],
    dataset_slug: Optional[str] = None,
    datasets_dir: Optional[str] = None,
    api_key: Optional[str] = None,
    config_path: Optional[Path] = None,
):
    """
    Downloads a Darwin dataset on the file system.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select.
    dataset_slug : str
        This is the dataset name with everything lower-case, special characters removed, and
        spaces replaced by dashes, e.g., `bird-species`. This string is unique within a team.
    datasets_dir : Path
        Path where the client should be initialized from (aka the root path).
    api_key: str
        API key to authenticate the client.
    config_path: Path
        Path to a configuration file which contains the authentication information to use.

    Returns
    -------
    splits : dict
        Keys are the different splits (random, tags, ...) and values are the relative file names.
    """
    # Authenticate the new KEY if available
    if api_key is not None:
        authenticate(api_key=api_key, default_team=True, datasets_dir=datasets_dir)
    # Get the client used to perform remote operations
    if config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.local(team_slug=team_slug)
    # Create a dataset identifier
    dataset_identifier = DatasetIdentifier.from_slug(dataset_slug=dataset_slug, team_slug=team_slug)
    # Get an object representing the remote dataset
    ds = client.get_remote_dataset(dataset_identifier=dataset_identifier)
    # Download the dataset on the local file system
    ds.pull()
    # Split the dataset in train/val/test and return the split file names
    splits = split_dataset(dataset=ds)
    return splits

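# A hypothetical invocation of run_demo: team slug, dataset slug and API key are
# placeholders. It authenticates, pulls the dataset locally and returns the
# computed split file names.
splits = run_demo(
    team_slug="my-team",
    dataset_slug="bird-species",
    datasets_dir=str(Path.home() / ".darwin" / "datasets"),
    api_key="YOUR_API_KEY",
)
print(splits)
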
def split(dataset_slug: str, val_percentage: float, test_percentage: float, seed: Optional[int] = 0):
    """Splits a local version of a dataset into train, validation, and test partitions.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation.
    val_percentage: float
        Percentage in the validation set.
    test_percentage: float
        Percentage in the test set.
    seed: int
        Random seed.
    """
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            try:
                split_path = split_dataset(
                    dataset_path=p,
                    release_name=identifier.version,
                    val_percentage=val_percentage,
                    test_percentage=test_percentage,
                    split_seed=seed,
                )
                print(f"Partition lists saved at {split_path}")
                return
            except NotFound as e:
                _error(e.name)
            except ValueError as e:
                _error(e.args[0])

    for p in client.list_deprecated_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            _error(
                f"Found a local version of the dataset {identifier.dataset_slug} which uses a deprecated format. "
                f"Run `darwin dataset migrate {identifier}` if you want to be able to use it in darwin-py."
            )

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")

def _populate_dataset(dataset_slug, items):
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    # Split the items into at most 100 batches and register each batch via the external data endpoint
    item_batches = [x.tolist() for x in np.array_split(items, min(len(items), 100))]
    for idx, batch in enumerate(item_batches):
        print(f'Batch {idx + 1}/{len(item_batches)}')
        payload = {
            'files': batch
        }
        print(payload)
        print(dataset.dataset_id)
        response = requests.put(
            f'https://darwin.v7labs.com/api/datasets/{dataset.dataset_id}/external_data',
            headers=HEADERS,
            json=payload)
        response.raise_for_status()

def path(dataset_slug: str) -> Path:
    """Returns the absolute path of the specified dataset, if synced"""
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            return p

    for p in client.list_deprecated_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            _error(
                f"Found a local version of the dataset {identifier.dataset_slug} which uses a deprecated format. "
                f"Run `darwin dataset migrate {identifier}` if you want to be able to use it in darwin-py."
                f"\n{p} (deprecated format)")

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")

def split(dataset_slug: str, val_percentage: float, test_percentage: float, seed: int = 0) -> None:
    """
    Splits a local version of a dataset into train, validation, and test partitions.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation.
    val_percentage: float
        Percentage in the validation set.
    test_percentage: float
        Percentage in the test set.
    seed: int
        Random seed. Defaults to 0.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    client: Client = _load_client(offline=True)

    for p in client.list_local_datasets(team_slug=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            try:
                split_path = split_dataset(
                    dataset_path=p,
                    release_name=identifier.version,
                    val_percentage=val_percentage,
                    test_percentage=test_percentage,
                    split_seed=seed,
                )
                print(f"Partition lists saved at {split_path}")
                return
            except ImportError as e:
                _error(e.msg)
            except NotFound as e:
                _error(e.name)
            except ValueError as e:
                _error(e.args[0])

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")

def export_dataset(dataset_slug: str,
                   annotation_class_ids: Optional[List] = None,
                   name: Optional[str] = None):
    """Create a new release for the dataset.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation.
    annotation_class_ids: List
        List of the classes to filter.
    name: str
        Name of the release.
    """
    client = _load_client(offline=False)
    identifier = DatasetIdentifier.parse(dataset_slug)
    ds = client.get_remote_dataset(identifier)
    ds.export(annotation_class_ids=annotation_class_ids, name=name)
    identifier.version = name
    print(f"Dataset {dataset_slug} successfully exported to {identifier}")

def create_dataset(dataset_slug: str) -> None:
    """
    Creates a dataset remotely. Exits the application if the dataset's name is already taken or
    is not valid.

    Parameters
    ----------
    dataset_slug : str
        Slug of the new dataset.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    client: Client = _load_client(team_slug=identifier.team_slug)
    try:
        dataset: RemoteDataset = client.create_dataset(name=identifier.dataset_slug)
        print(
            f"Dataset '{dataset.name}' ({dataset.team}/{dataset.slug}) has been created.\nAccess at {dataset.remote_path}"
        )
        print_new_version_info(client)
    except NameTaken:
        _error(f"Dataset name '{identifier.dataset_slug}' is already taken.")
    except ValidationError:
        _error(f"Dataset name '{identifier.dataset_slug}' is not valid.")

def raises_with_team_only():
    with pytest.raises(ValueError):
        DatasetIdentifier.parse("team/")

def optional_team_with_version():
    dataset_identifier = DatasetIdentifier.parse("dataset:1.0")
    assert dataset_identifier.team_slug is None
    assert dataset_identifier.dataset_slug == "dataset"
    assert dataset_identifier.version == "1.0"

def with_numbers():
    dataset_identifier = DatasetIdentifier.parse("team1/dataset1")
    assert dataset_identifier.team_slug == "team1"
    assert dataset_identifier.dataset_slug == "dataset1"
    assert dataset_identifier.version is None

def identifier(self) -> DatasetIdentifier:
    return DatasetIdentifier(team_slug=self.team, dataset_slug=self.slug)

def detectron2_register_dataset(
    dataset: str,
    release_name: Optional[str] = "latest",
    partition: Optional[str] = None,
    split: Optional[str] = "default",
    split_type: Optional[str] = "stratified",
    evaluator_type: Optional[str] = None,
) -> str:
    """Registers a local Darwin-formatted dataset in Detectron2.

    Parameters
    ----------
    dataset: str
        Dataset slug.
    release_name: str
        Version of the dataset.
    partition: str
        Selects one of the partitions [train, val, test].
    split: str
        Selects the split that defines the percentages used (use 'default' to select the default split).
    split_type: str
        Heuristic used to do the split [random, stratified].
    evaluator_type: str
        Evaluator to be used in the val and test sets.

    Returns
    -------
    str
        Name under which the dataset was registered in Detectron2's DatasetCatalog.
    """
    try:
        from detectron2.data import DatasetCatalog, MetadataCatalog
    except ImportError:
        print("Detectron2 not found.")
        sys.exit(1)
    from darwin.dataset.utils import get_annotations, get_classes

    dataset_path: Optional[Path] = None
    if os.path.isdir(dataset):
        dataset_path = Path(dataset)
    else:
        identifier = DatasetIdentifier.parse(dataset)
        if identifier.version:
            release_name = identifier.version

        client = _load_client(offline=True)
        dataset_path = None
        for path in client.list_local_datasets(team_slug=identifier.team_slug):
            if identifier.dataset_slug == path.name:
                dataset_path = path

        if not dataset_path:
            _error(
                f"Dataset '{identifier.dataset_slug}' does not exist locally. "
                f"Use 'darwin dataset remote' to see all the available datasets, "
                f"and 'darwin dataset pull' to pull them.")

    catalog_name = f"darwin_{dataset_path.name}"
    if partition:
        catalog_name += f"_{partition}"

    classes = get_classes(dataset_path=dataset_path, release_name=release_name, annotation_type="polygon")

    DatasetCatalog.register(
        catalog_name,
        lambda partition=partition: list(
            get_annotations(
                dataset_path,
                partition=partition,
                split=split,
                split_type=split_type,
                release_name=release_name,
                annotation_type="polygon",
                annotation_format="coco",
                ignore_inconsistent_examples=True,
            )),
    )
    MetadataCatalog.get(catalog_name).set(thing_classes=classes)
    if evaluator_type:
        MetadataCatalog.get(catalog_name).set(evaluator_type=evaluator_type)
    return catalog_name

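# A sketch of how the registered catalog name could be consumed from Detectron2.
# The dataset slug and the idea of wiring it into a config are illustrative
# assumptions, not part of the function above.
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog

catalog_name = detectron2_register_dataset("my-team/my-dataset:latest", partition="train")
cfg = get_cfg()
cfg.DATASETS.TRAIN = (catalog_name,)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(MetadataCatalog.get(catalog_name).thing_classes)
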
def identifier(self) -> DatasetIdentifier:
    return DatasetIdentifier(team_slug=self.team_slug, dataset_slug=self.dataset_slug, version=self.name)

def with_dashes():
    dataset_identifier = DatasetIdentifier.parse("my-team/my-dataset")
    assert dataset_identifier.team_slug == "my-team"
    assert dataset_identifier.dataset_slug == "my-dataset"
    assert dataset_identifier.version is None