Example #1
def _load_client(
    team_slug: Optional[str] = None,
    offline: bool = False,
    maybe_guest: bool = False,
    dataset_identifier: Optional[str] = None,
):
    """Fetches a client, potentially offline

    Parameters
    ----------
    offline : bool
        Flag for using an offline client

    maybe_guest : bool
        Flag to make a guest client, if config is missing
    Returns
    -------
    Client
    The client requested
    """
    if not team_slug and dataset_identifier:
        team_slug = DatasetIdentifier.parse(dataset_identifier).team_slug
    try:
        config_dir = Path.home() / ".darwin" / "config.yaml"
        client = Client.from_config(config_dir, team_slug=team_slug)
        return client
    except MissingConfig:
        if maybe_guest:
            return Client.from_guest()
        else:
            _error("Authenticate first")
    except InvalidLogin:
        _error("Please re-authenticate")
    except Unauthenticated:
        _error("Please re-authenticate")
Example #2
def pull_dataset(dataset_slug: str):
    """Downloads a remote dataset (images and annotations) in the datasets directory.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation
    """
    version = DatasetIdentifier.parse(dataset_slug).version or "latest"
    client = _load_client(offline=False)
    try:
        dataset = client.get_remote_dataset(dataset_identifier=dataset_slug)
    except NotFound:
        _error(f"dataset '{dataset_slug}' does not exist at {client.url}. "
               f"Use 'darwin remote' to list all the remote datasets.")
    except Unauthenticated:
        _error(f"please re-authenticate")
    try:
        release = dataset.get_release(version)
        dataset.pull(release=release)
    except NotFound:
        _error(
            f"Version '{dataset.identifier}:{version}' does not exist. "
            f"Use 'darwin dataset releases' to list all available versions.")
    print(f"Dataset {release.identifier} downloaded at {dataset.local_path}.")
Example #3
def get_dataset(
    dataset_slug: str,
    dataset_type: str,
    partition: Optional[str] = None,
    split: str = "default",
    split_type: str = "random",
    transform: Optional[List] = None,
):
    """
    Creates and returns a dataset

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset to retrieve
    dataset_type: str
        The type of dataset [classification, instance-segmentation, semantic-segmentation]
    partition: str
        Selects one of the partitions [train, val, test, None]. (Default: None)
    split: str
        Selects the split that defines the percentages used. (Default: 'default')
    split_type: str
        Heuristic used to do the split [random, stratified]. (Default: 'random')
    transform : Optional[List]
        List of PyTorch (torchvision) transforms

    Returns
    -------
    LocalDataset
        Dataset of the requested type, built from the local copy
    """
    dataset_functions = {
        "classification": ClassificationDataset,
        "instance-segmentation": InstanceSegmentationDataset,
        "semantic-segmentation": SemanticSegmentationDataset,
    }
    dataset_function = dataset_functions.get(dataset_type)
    if not dataset_function:
        list_of_types = ", ".join(dataset_functions.keys())
        _error(f"dataset_type needs to be one of '{list_of_types}'")

    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            return dataset_function(
                dataset_path=p,
                partition=partition,
                split=split,
                split_type=split_type,
                release_name=identifier.version,
                transform=transform,
            )

    for p in client.list_deprecated_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            _error(
                f"Found a local version of the dataset {identifier.dataset_slug} which uses a deprecated format. "
                f"Run `darwin dataset migrate {identifier}` if you want to be able to use it in darwin-py."
            )

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")
Example #4
def path(dataset_slug: str) -> Path:
    """
    Returns the absolute path of the specified dataset.
    Exits the application if the dataset does not exist locally.

    Parameters
    ----------
    dataset_slug: str
        The dataset's slug.

    Returns
    -------
    Path
        The absolute path of the dataset.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    client: Client = _load_client(offline=True)

    for path in client.list_local_datasets(team_slug=identifier.team_slug):
        if identifier.dataset_slug == path.name:
            return path

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")
Example #5
def export_dataset(dataset_slug: str,
                   include_url_token: bool,
                   name: str,
                   annotation_class_ids: Optional[List[str]] = None) -> None:
    """
    Create a new release for the dataset.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation.
    include_url_token: bool
        If True, includes the URL token; if False, does not.
    name: str
        Name of the release.
    annotation_class_ids: Optional[List[str]]
        List of the classes to filter. Defaults to None.
    """
    client: Client = _load_client(offline=False)
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    ds: RemoteDataset = client.get_remote_dataset(identifier)
    ds.export(annotation_class_ids=annotation_class_ids,
              name=name,
              include_url_token=include_url_token)
    identifier.version = name
    print(f"Dataset {dataset_slug} successfully exported to {identifier}")
    print_new_version_info(client)
Example #6
    def get_remote_dataset(
            self,
            dataset_identifier: Union[str,
                                      DatasetIdentifier]) -> RemoteDataset:
        """Get a remote dataset based on the parameter passed. You can only choose one of the
        possible parameters and calling this method with multiple ones will result in an
        error.

        Parameters
        ----------
        dataset_identifier : int
            ID of the dataset to return

        Returns
        -------
        RemoteDataset
            Initialized dataset
        """
        if isinstance(dataset_identifier, str):
            dataset_identifier = DatasetIdentifier.parse(dataset_identifier)
        if not dataset_identifier.team_slug:
            dataset_identifier.team_slug = self.default_team

        matching_datasets = [
            dataset for dataset in self.list_remote_datasets(
                team=dataset_identifier.team_slug)
            if dataset.slug == dataset_identifier.dataset_slug
        ]
        if not matching_datasets:
            raise NotFound(dataset_identifier)
        return matching_datasets[0]
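Given a Client instance, both accepted argument forms behave identically, since a raw string is parsed into a DatasetIdentifier internally (the slug below is made up):

dataset = client.get_remote_dataset("my-team/my-dataset")
same_dataset = client.get_remote_dataset(DatasetIdentifier.parse("my-team/my-dataset"))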
Example #7
def get_dataset_files(dataset_slug) -> Iterator[DatasetItem]:
    """ Return a list of filenames in a dataset along with their status """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)
    resp = dataset.fetch_remote_files()

    return resp
Example #8
def add_labels_to_dataset(dataset_slug: str, labels: List[str], label_type: str):
    """ Add labels to a dataset """
    assert label_type in ['polygon', 'tag']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    for label in labels:
        dataset.create_annotation_class(label, label_type)
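For example, creating two polygon classes in one call; the slug and labels are illustrative, and API_KEY must be defined at module level as the snippet assumes:

add_labels_to_dataset("my-team/my-dataset", labels=["car", "truck"], label_type="polygon")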
Example #9
def populate_dataset_annotations(dataset_slug, format_name: str, file_paths: List[str]):
    """Import annotation files into a dataset using one of the supported format parsers."""
    assert format_name in ['darwin', 'coco', 'pascal_voc']
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    format_dict = {k: v for (k, v) in darwin.importer.formats.supported_formats}
    parser = format_dict[format_name]

    importer.import_annotations(dataset, parser, file_paths)
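A usage sketch, assuming a COCO-format annotation file on disk (the slug and path are illustrative):

populate_dataset_annotations(
    "my-team/my-dataset",
    format_name="coco",
    file_paths=["exports/instances_train.json"],
)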
Example #10
def migrate_dataset(dataset_slug: str):
    """Migrates an outdated local dataset to the latest format.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation
    """
    identifier = DatasetIdentifier.parse(dataset_slug)
    if not identifier.team_slug:
        _error(
            "Team name missing.\nUsage: darwin dataset migrate <team-name>/<dataset-name>"
        )

    client = _load_client(offline=True)
    authenticated_teams = [e["slug"] for e in client.config.get_all_teams()]
    if identifier.team_slug not in authenticated_teams:
        _error(
            f"Could not find '{identifier.team_slug}' in the authenticated teams. "
            "Run 'darwin authenticate' to authenticate it.")

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            print(f"Dataset '{dataset_slug}' already migrated.")
            return

    old_path = None
    for p in client.list_deprecated_local_datasets(identifier.team_slug):
        if identifier.dataset_slug == p.name:
            old_path = p
    if not old_path:
        _error(
            f"Could not find a deprecated local version of the dataset '{dataset_slug}'. "
            f"Use 'darwin dataset pull {dataset_slug}' to pull the latest version from darwin."
        )

    # Move the dataset under the team_slug folder
    team_config = client.config.get_team(identifier.team_slug)
    team_path = Path(team_config["datasets_dir"]) / identifier.team_slug
    team_path.mkdir(exist_ok=True)
    shutil.move(str(old_path), str(team_path))

    # Update internal structure
    dataset_path = team_path / old_path.name
    release_path = dataset_path / "releases" / "migrated"
    release_path.mkdir(parents=True, exist_ok=True)  # target must exist before shutil.move
    for p in ["annotations", "lists"]:
        if (dataset_path / p).exists():
            shutil.move(str(dataset_path / p), str(release_path / p))

    latest_release = dataset_path / "releases/latest"
    if latest_release.exists():
        latest_release.unlink()
    latest_release.symlink_to("./migrated")

    print(f"Dataset {identifier.dataset_slug} migrated to {dataset_path}.")
Example #11
    def get_remote_dataset(
            self,
            dataset_identifier: Union[str,
                                      DatasetIdentifier]) -> RemoteDataset:
        """Get a remote dataset based on the parameter passed. You can only choose one of the
        possible parameters and calling this method with multiple ones will result in an
        error.

        Parameters
        ----------
        dataset_identifier : int
            ID of the dataset to return

        Returns
        -------
        RemoteDataset
            Initialized dataset
        """
        if isinstance(dataset_identifier, str):
            dataset_identifier = DatasetIdentifier.parse(dataset_identifier)
        if not dataset_identifier.team_slug:
            dataset_identifier.team_slug = self.default_team

        try:
            matching_datasets = [
                dataset for dataset in self.list_remote_datasets(
                    team=dataset_identifier.team_slug)
                if dataset.slug == dataset_identifier.dataset_slug
            ]
        except Unauthorized:
            # There is a chance that we tried to access an open dataset
            dataset = self.get(
                f"{dataset_identifier.team_slug}/{dataset_identifier.dataset_slug}"
            )

            # If there isn't a record of this team, create one.
            if not self.config.get_team(dataset_identifier.team_slug,
                                        raise_on_invalid_team=False):
                datasets_dir = Path.home() / ".darwin" / "datasets"
                self.config.set_team(team=dataset_identifier.team_slug,
                                     api_key="",
                                     datasets_dir=str(datasets_dir))

            return RemoteDataset(
                name=dataset["name"],
                slug=dataset["slug"],
                team=dataset_identifier.team_slug,
                dataset_id=dataset["id"],
                image_count=dataset["num_images"],
                progress=0,
                client=self,
            )
        if not matching_datasets:
            raise NotFound(dataset_identifier)
        return matching_datasets[0]
Example #12
def create_dataset(dataset_slug: str):
    """Creates a dataset remotely"""
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(team_slug=identifier.team_slug)
    try:
        dataset = client.create_dataset(name=identifier.dataset_slug)
        print(
            f"Dataset '{dataset.name}' ({dataset.team}/{dataset.slug}) has been created.\nAccess at {dataset.remote_path}"
        )
    except NameTaken:
        _error(f"Dataset name '{identifier.dataset_slug}' is already taken.")
    except ValidationError:
        _error(f"Dataset name '{identifier.dataset_slug}' is not valid.")
Example #13
def path(dataset_slug: str) -> Path:
    """Returns the absolute path of the specified dataset, if synced"""
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)
    try:
        for p in client.list_local_datasets(team=identifier.team_slug):
            if identifier.dataset_slug == p.name:
                return p
    except NotFound as e:
        _error(
            f"Dataset '{e.name}' does not exist locally. "
            f"Use 'darwin dataset remote' to see all the available datasets, "
            f"and 'darwin dataset pull' to pull them.")
Example #14
def create_dataset(dataset_slug):
    """ Create new empty dataset """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.create_dataset(name=identifier.dataset_slug)

    dataset_info = dict(
        name=dataset.name,
        id=dataset.dataset_id,
        slug=dataset.slug,
        remote_path=dataset.remote_path
    )
    return dataset_info
Example #15
def pull_dataset(dataset_slug: str,
                 only_annotations: bool = False,
                 folders: bool = False,
                 video_frames: bool = False) -> None:
    """
    Downloads a remote dataset (images and annotations) in the datasets directory.
    Exits the application if the dataset is not found, the user is not authenticated, there are
    no releases, or the export format for the latest release is not supported.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation.
    only_annotations: bool
        Download only the annotations and no corresponding images. Defaults to False.
    folders: bool
        Recreates the folders in the dataset. Defaults to False.
    video_frames: bool
        Pulls video frames images instead of video files. Defaults to False.
    """
    version: str = DatasetIdentifier.parse(dataset_slug).version or "latest"
    client: Client = _load_client(offline=False, maybe_guest=True)
    try:
        dataset: RemoteDataset = client.get_remote_dataset(
            dataset_identifier=dataset_slug)
    except NotFound:
        _error(
            f"Dataset '{dataset_slug}' does not exist; please check the spelling. "
            f"Use 'darwin remote' to list all the remote datasets.")
    except Unauthenticated:
        _error("Please re-authenticate")

    try:
        release: Release = dataset.get_release(version)
        dataset.pull(release=release,
                     only_annotations=only_annotations,
                     use_folders=folders,
                     video_frames=video_frames)
        print_new_version_info(client)
    except NotFound:
        _error(
            f"Version '{dataset.identifier}:{version}' does not exist. "
            f"Use 'darwin dataset releases' to list all available versions.")
    except UnsupportedExportFormat as uef:
        _error(
            f"Version '{dataset.identifier}:{version}' is of format '{uef.format}', "
            f"only the darwin format ('json') is supported for `darwin dataset pull`"
        )

    print(f"Dataset {release.identifier} downloaded at {dataset.local_path}. ")
Example #16
def dataset_convert(dataset_identifier: str,
                    format: str,
                    output_dir: Optional[PathLike] = None) -> None:
    """
    Converts the annotations from the given dataset to the given format.
    Exits the application if no dataset with the given slug exists or no releases for the dataset
    were previously pulled.

    Parameters
    ----------
    dataset_identifier: str
        The dataset identifier, normally in the "<team-slug>/<dataset-slug>:<version>" form.
    format: str
        The format we want to convert to.
    output_dir: Optional[PathLike]
        The folder where the exported annotation files will be placed. If None, they go inside
        the annotations folder of the dataset, under 'other_formats/{format}'. Defaults to None.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_identifier)
    client: Client = _load_client(team_slug=identifier.team_slug)

    try:
        parser: ExportParser = get_exporter(format)
        dataset: RemoteDataset = client.get_remote_dataset(
            dataset_identifier=identifier)
        if not dataset.local_path.exists():
            _error(
                f"No annotations downloaded for dataset {dataset}, first pull a release using "
                f"'darwin dataset pull {identifier}'")

        release_path: Path = get_release_path(dataset.local_path,
                                              identifier.version)
        annotations_path: Path = release_path / "annotations"
        if output_dir is None:
            output_dir = release_path / "other_formats" / format
        else:
            output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        export_annotations(parser, [annotations_path], output_dir)
    except (ExporterNotFoundError, AttributeError):
        _error(
            f"Unsupported export format: {format}, currently supported: {export_formats}"
        )
    except NotFound as e:
        _error(f"No dataset with name '{e.name}'")
Example #17
def run_demo(
    *,
    team_slug: Optional[str],
    dataset_slug: Optional[str] = None,
    datasets_dir: Optional[str] = None,
    api_key: Optional[str] = None,
    config_path: Optional[Path] = None,
):
    """
    Download a Darwin dataset on the file system.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select
    dataset_slug : str
        This is the dataset name with everything lower-cased, special characters removed, and
        spaces replaced by dashes, e.g., `bird-species`. This string is unique within a team
    datasets_dir : Path
        Path where the client should be initialized from (aka the root path)
    api_key: str
        API key to authenticate the client
    config_path: Path
        Path to a configuration file which contains the authentication information to use

    Returns
    -------
    splits : dict
        Keys are the different splits (random, tags, ...) and values are the relative file names
    """
    # Authenticate the new KEY if available
    if api_key is not None:
        authenticate(api_key=api_key,
                     default_team=True,
                     datasets_dir=datasets_dir)
    # Get the client used to perform remote operations
    if config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.local(team_slug=team_slug)
    # Create a dataset identifier
    dataset_identifier = DatasetIdentifier.from_slug(dataset_slug=dataset_slug,
                                                     team_slug=team_slug)
    # Get an object representing the remote dataset
    ds = client.get_remote_dataset(dataset_identifier=dataset_identifier)
    # Download the dataset on the local file system
    ds.pull()
    # Split the dataset in train/val/test
    splits = split_dataset(dataset=ds)
    return splits
Example #18
def split(dataset_slug: str,
          val_percentage: float,
          test_percentage: float,
          seed: int = 0):
    """Splits a local version of a dataset into train, validation, and test partitions

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation
    val_percentage: float
        Percentage in the validation set
    test_percentage: float
        Percentage in the test set
    seed: int
        Random seed. Defaults to 0.
    """
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            try:
                split_path = split_dataset(
                    dataset_path=p,
                    release_name=identifier.version,
                    val_percentage=val_percentage,
                    test_percentage=test_percentage,
                    split_seed=seed,
                )
                print(f"Partition lists saved at {split_path}")
                return
            except NotFound as e:
                _error(e.name)
            except ValueError as e:
                _error(e.args[0])

    for p in client.list_deprecated_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            _error(
                f"Found a local version of the dataset {identifier.dataset_slug} which uses a deprecated format. "
                f"Run `darwin dataset migrate {identifier}` if you want to be able to use it in darwin-py."
            )

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")
Example #19
def _populate_dataset(dataset_slug, items):
    """Register items with a dataset via the external_data endpoint, in batches."""
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    # Split the items into at most 100 batches, one request per batch
    item_batches = [x.tolist() for x in np.array_split(items, min(len(items), 100))]
    for idx, batch in enumerate(item_batches):
        print(f'Batch {idx + 1}/{len(item_batches)}')
        payload = {
            'files': batch
        }
        print(payload)
        print(dataset.dataset_id)
        response = requests.put(f'https://darwin.v7labs.com/api/datasets/{dataset.dataset_id}/external_data', headers=HEADERS,
                                json=payload)

        response.raise_for_status()
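A sketch of driving the batched registration above; the item schema expected by the external_data endpoint is assumed here, not confirmed by the source:

# Hypothetical items referencing externally hosted files by storage key.
items = [{"key": f"images/{i}.jpg", "name": f"{i}.jpg"} for i in range(250)]
_populate_dataset("my-team/my-dataset", items)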
Example #20
def path(dataset_slug: str) -> Path:
    """Returns the absolute path of the specified dataset, if synced"""
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)

    for p in client.list_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            return p

    for p in client.list_deprecated_local_datasets(team=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            _error(
                f"Found a local version of the dataset {identifier.dataset_slug} which uses a deprecated format. "
                f"Run `darwin dataset migrate {identifier}` if you want to be able to use it in darwin-py."
                f"\n{p} (deprecated format)")

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")
Example #21
def split(dataset_slug: str,
          val_percentage: float,
          test_percentage: float,
          seed: int = 0) -> None:
    """
    Splits a local version of a dataset into train, validation, and test partitions.

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation.
    val_percentage: float
        Percentage in the validation set.
    test_percentage: float
        Percentage in the test set.
    seed: int
        Random seed. Defaults to 0.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    client: Client = _load_client(offline=True)

    for p in client.list_local_datasets(team_slug=identifier.team_slug):
        if identifier.dataset_slug == p.name:
            try:
                split_path = split_dataset(
                    dataset_path=p,
                    release_name=identifier.version,
                    val_percentage=val_percentage,
                    test_percentage=test_percentage,
                    split_seed=seed,
                )
                print(f"Partition lists saved at {split_path}")
                return
            except ImportError as e:
                _error(e.msg)
            except NotFound as e:
                _error(e.name)
            except ValueError as e:
                _error(e.args[0])

    _error(f"Dataset '{identifier.dataset_slug}' does not exist locally. "
           f"Use 'darwin dataset remote' to see all the available datasets, "
           f"and 'darwin dataset pull' to pull them.")
Example #22
def export_dataset(dataset_slug: str,
                   annotation_class_ids: Optional[List] = None,
                   name: Optional[str] = None):
    """Create a new release for the dataset

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset on which we perform the operation
    annotation_class_ids: Optional[List]
        List of the classes to filter
    name: Optional[str]
        Name of the release
    """
    client = _load_client(offline=False)
    identifier = DatasetIdentifier.parse(dataset_slug)
    ds = client.get_remote_dataset(identifier)
    ds.export(annotation_class_ids=annotation_class_ids, name=name)
    identifier.version = name
    print(f"Dataset {dataset_slug} successfully exported to {identifier}")
Example #23
def create_dataset(dataset_slug: str) -> None:
    """
    Creates a dataset remotely. Exits the application if the dataset's name is already taken or is
    not valid.

    Parameters
    ----------
    dataset_slug : str
        Slug of the new dataset.
    """
    identifier: DatasetIdentifier = DatasetIdentifier.parse(dataset_slug)
    client: Client = _load_client(team_slug=identifier.team_slug)
    try:
        dataset: RemoteDataset = client.create_dataset(
            name=identifier.dataset_slug)
        print(
            f"Dataset '{dataset.name}' ({dataset.team}/{dataset.slug}) has been created.\nAccess at {dataset.remote_path}"
        )
        print_new_version_info(client)
    except NameTaken:
        _error(f"Dataset name '{identifier.dataset_slug}' is already taken.")
    except ValidationError:
        _error(f"Dataset name '{identifier.dataset_slug}' is not valid.")
Example #24
def raises_with_team_only():
    with pytest.raises(ValueError):
        DatasetIdentifier.parse("team/")
Example #25
def optional_team_with_version():
    dataset_identifier = DatasetIdentifier.parse("dataset:1.0")
    assert dataset_identifier.team_slug is None
    assert dataset_identifier.dataset_slug == "dataset"
    assert dataset_identifier.version == "1.0"
Example #26
def with_numbers():
    dataset_identifier = DatasetIdentifier.parse("team1/dataset1")
    assert dataset_identifier.team_slug == "team1"
    assert dataset_identifier.dataset_slug == "dataset1"
    assert dataset_identifier.version is None
Example #27
    @property
    def identifier(self) -> DatasetIdentifier:
        return DatasetIdentifier(team_slug=self.team, dataset_slug=self.slug)
Example #28
def detectron2_register_dataset(
    dataset: str,
    release_name: Optional[str] = "latest",
    partition: Optional[str] = None,
    split: Optional[str] = "default",
    split_type: Optional[str] = "stratified",
    evaluator_type: Optional[str] = None,
) -> str:
    """Registers a local Darwin-formatted dataset in Detectron2

    Parameters
    ----------
    dataset: str
        Dataset slug
    release_name: str
        Version of the dataset
    partition: str
        Selects one of the partitions [train, val, test]
    split: str
        Selects the split that defines the percentages used (use 'default' to select the default split)
    split_type: str
        Heuristic used to do the split [random, stratified]
    evaluator_type: str
        Evaluator to be used in the val and test sets
    """
    try:
        from detectron2.data import DatasetCatalog, MetadataCatalog
    except ImportError:
        print("Detectron2 not found.")
        sys.exit(1)
    from darwin.dataset.utils import get_annotations, get_classes

    dataset_path: Optional[Path] = None
    if os.path.isdir(dataset):
        dataset_path = Path(dataset)
    else:
        identifier = DatasetIdentifier.parse(dataset)
        if identifier.version:
            release_name = identifier.version

        client = _load_client(offline=True)
        dataset_path = None
        for path in client.list_local_datasets(team_slug=identifier.team_slug):
            if identifier.dataset_slug == path.name:
                dataset_path = path

        if not dataset_path:
            _error(
                f"Dataset '{identifier.dataset_slug}' does not exist locally. "
                f"Use 'darwin dataset remote' to see all the available datasets, "
                f"and 'darwin dataset pull' to pull them.")

    catalog_name = f"darwin_{dataset_path.name}"
    if partition:
        catalog_name += f"_{partition}"

    classes = get_classes(dataset_path=dataset_path,
                          release_name=release_name,
                          annotation_type="polygon")

    DatasetCatalog.register(
        catalog_name,
        lambda partition=partition: list(
            get_annotations(
                dataset_path,
                partition=partition,
                split=split,
                split_type=split_type,
                release_name=release_name,
                annotation_type="polygon",
                annotation_format="coco",
                ignore_inconsistent_examples=True,
            )),
    )
    MetadataCatalog.get(catalog_name).set(thing_classes=classes)
    if evaluator_type:
        MetadataCatalog.get(catalog_name).set(evaluator_type=evaluator_type)
    return catalog_name
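A sketch of registering and then consuming the catalog entry (the slug and evaluator type are illustrative):

from detectron2.data import DatasetCatalog, MetadataCatalog

name = detectron2_register_dataset("my-team/my-dataset",
                                   partition="train",
                                   evaluator_type="coco")
records = DatasetCatalog.get(name)                 # list of Detectron2 record dicts
classes = MetadataCatalog.get(name).thing_classes  # class names registered above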
Example #29
    @property
    def identifier(self) -> DatasetIdentifier:
        return DatasetIdentifier(team_slug=self.team_slug,
                                 dataset_slug=self.dataset_slug,
                                 version=self.name)
Example #30
def with_dashes():
    dataset_identifier = DatasetIdentifier.parse("my-team/my-dataset")
    assert dataset_identifier.team_slug == "my-team"
    assert dataset_identifier.dataset_slug == "my-dataset"
    assert dataset_identifier.version is None
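Combining the parsing rules exercised by the tests above, a fully qualified identifier carries team, dataset, and version at once:

identifier = DatasetIdentifier.parse("my-team/my-dataset:2.0")
assert identifier.team_slug == "my-team"
assert identifier.dataset_slug == "my-dataset"
assert identifier.version == "2.0"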