Exemplo n.º 1
0
    def split(
        self,
        val_percentage: float = 0.1,
        test_percentage: float = 0,
        split_seed: int = 0,
        make_default_split: bool = True,
        release_name: Optional[str] = None,
    ):
        """
        Creates lists of file names for each split for train, validation, and test.
        Note: This function needs a local copy of the dataset.

        Parameters
        ----------
        val_percentage : float
            Percentage of images used in the validation set
        test_percentage : float
            Percentage of images used in the test set
        split_seed : int
            Fix seed for random split creation
        make_default_split: bool
            Makes this split the default split
        release_name: str
            Version of the dataset. When it is ``None`` or ``"latest"``, the name of
            the release returned by ``self.get_release("latest")`` is used instead.

        Raises
        ------
        NotFound
            If ``self.local_path`` does not exist.
        """
        if not self.local_path.exists():
            # Adjacent string literals are used on purpose: a backslash continuation
            # inside a single literal would embed the next line's indentation in the
            # message shown to the user.
            raise NotFound(
                "Local dataset not found: the split is performed on the local copy of the dataset. "
                "Pull the dataset from Darwin first using pull()"
            )

        # Resolve the "latest" alias (or a missing name) to a concrete release name.
        if release_name in ["latest", None]:
            release = self.get_release("latest")
            release_name = release.name

        split_dataset(
            self.local_path,
            release_name=release_name,
            val_percentage=val_percentage,
            test_percentage=test_percentage,
            split_seed=split_seed,
            make_default_split=make_default_split,
        )
Exemplo n.º 2
0
def run_demo(
    *,
    team_slug: Optional[str],
    dataset_slug: Optional[str] = None,
    datasets_dir: Optional[str] = None,
    api_key: Optional[str] = None,
    config_path: Optional[Path] = None,
):
    """
    Download a Darwin dataset on the file system and split it.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select
    dataset_slug : str
        This is the dataset name with everything lower-case, removed specials characters and
        spaces are replaced by dashes, e.g., `bird-species`. This string is unique within a team
    datasets_dir : str
        Path where the client should be initialized from (aka the root path).
        Only used when `api_key` is given, as it is forwarded to `authenticate`.
    api_key: str
        API key to authenticate the client. When `None`, no authentication step is run.
    config_path: Path
        Path to a configuration file which contains the authentication information to use.
        When `None`, the client is created with `Client.local(team_slug=team_slug)`.

    Returns
    -------
    splits : dict
        The value returned by `split_dataset`. According to the original documentation:
        keys are the different splits (random, tags, ...) and values are the relative file names
    """
    # Authenticate the new KEY if available
    if api_key is not None:
        authenticate(api_key=api_key,
                     default_team=True,
                     datasets_dir=datasets_dir)
    # Get the client used to perform remote operations
    if config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.local(team_slug=team_slug)
    # Create a dataset identifier
    dataset_identifier = DatasetIdentifier.from_slug(dataset_slug=dataset_slug,
                                                     team_slug=team_slug)
    # Get an object representing the remote dataset
    ds = client.get_remote_dataset(dataset_identifier=dataset_identifier)
    # Download the dataset on the local file system
    ds.pull()
    # Split the dataset in train/val/test and hand the result back to the caller;
    # previously the result was computed and then dropped, so callers got `None`.
    splits = split_dataset(dataset=ds)
    return splits
Exemplo n.º 3
0
def split(dataset_slug: str,
          val_percentage: float,
          test_percentage: float,
          seed: Optional[int] = 0):
    """Partition a local copy of a dataset into train, validation, and test lists.

    The first local dataset whose directory name equals the parsed dataset slug is
    split; on success the location of the partition lists is printed and the
    function returns. Every failure path is reported through `_error`
    (presumably terminating the program -- confirm against `_error`).

    Parameters
    ----------
    dataset_slug: str
        Slug of the dataset to operate on, parsed with `DatasetIdentifier.parse`
    val_percentage: float
        Percentage in the validation set
    test_percentage: float
        Percentage in the test set
    seed: int
        Random seed, forwarded to `split_dataset` as `split_seed`
    """
    identifier = DatasetIdentifier.parse(dataset_slug)
    client = _load_client(offline=True)

    # First pass: datasets stored in the current local format.
    for local_path in client.list_local_datasets(team=identifier.team_slug):
        if not identifier.dataset_slug == local_path.name:
            continue
        try:
            split_path = split_dataset(
                dataset_path=local_path,
                release_name=identifier.version,
                val_percentage=val_percentage,
                test_percentage=test_percentage,
                split_seed=seed,
            )
            print(f"Partition lists saved at {split_path}")
            return
        except NotFound as not_found:
            _error(not_found.name)
        except ValueError as bad_value:
            _error(bad_value.args[0])

    # Second pass: a copy in the deprecated format gets a dedicated hint.
    for legacy_path in client.list_deprecated_local_datasets(team=identifier.team_slug):
        if not identifier.dataset_slug == legacy_path.name:
            continue
        deprecated_message = (
            f"Found a local version of the dataset {identifier.dataset_slug} which uses a deprecated format. "
            f"Run `darwin dataset migrate {identifier}` if you want to be able to use it in darwin-py."
        )
        _error(deprecated_message)

    # Nothing matched locally in either format.
    missing_message = (
        f"Dataset '{identifier.dataset_slug}' does not exist locally. "
        "Use 'darwin dataset remote' to see all the available datasets, "
        "and 'darwin dataset pull' to pull them."
    )
    _error(missing_message)
Exemplo n.º 4
0
def get_darwin_dataset(
        *,
        team_slug: Optional[str] = None,
        dataset_slug: Optional[str] = None,
        dataset_id: Optional[str] = None,
        projects_dir: Optional[str] = None,
        token: Optional[str] = None,
        config_path: Optional[Path] = None,
        email: Optional[str] = None,
        password: Optional[str] = None,
        val_percentage: Optional[float] = 0.1,
        test_percentage: Optional[float] = 0.2,
        force_resplit: Optional[bool] = False,
        split_seed: Optional[int] = 42
):
    """
    Download a Darwin dataset on the file system and split it.
    The caller selects how to authenticate and how the dataset is split.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select. When `None`, the client's team is left unchanged
    dataset_slug : str
        This is the dataset name with everything lower-case, removed specials characters and
        spaces are replaced by dashes, e.g., `bird-species`. This string is unique within a team
    dataset_id : str
        Identifier forwarded, together with `dataset_slug`, to `client.get_remote_dataset`
    projects_dir : Path
        Path where the client should be initialized from (aka the root path)
    token : str
        Access token used to authenticate. Per the original note it is valid for
        roughly 8 minutes (not verifiable from this function)
    config_path : str
        Path to a configuration file to use to create the client
    email : str
        Email of the Darwin user to use for the login
    password : str
        Password of the Darwin user to use for the login
    val_percentage : float
        Percentage of images used in the validation set
    test_percentage : float
        Percentage of images used in the test set
    force_resplit : bool
        Discard previous split and create a new one
    split_seed : int
        Fix seed for random split creation

    Returns
    -------
    splits : dict
        The value returned by `split_dataset`. According to the original documentation:
        keys are the different splits (random, tags, ...) and values are the relative file names
    """
    # Build the client. Order of precedence: email+password, token, config file,
    # default client. The order is arbitrary and should not matter in practice.
    has_credentials = email is not None and password is not None
    if has_credentials:
        client = Client.login(email=email, password=password, projects_dir=projects_dir)
    elif token is not None:
        client = Client.from_token(token=token, projects_dir=projects_dir)
    elif config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.default(projects_dir=projects_dir)

    # Switch team only when one was explicitly requested.
    if team_slug is not None:
        client.set_team(slug=team_slug)

    # Fetch the remote dataset and download its data to the file system.
    remote_dataset = client.get_remote_dataset(slug=dataset_slug, dataset_id=dataset_id)
    remote_dataset.pull()

    # Split with the options requested by the caller.
    split_options = {
        "val_percentage": val_percentage,
        "test_percentage": test_percentage,
        "force_resplit": force_resplit,
        "split_seed": split_seed,
    }
    return split_dataset(dataset=remote_dataset, **split_options)