Пример #1
0
def _load_client(
    team_slug: Optional[str] = None,
    offline: bool = False,
    maybe_guest: bool = False,
    dataset_identifier: Optional[str] = None,
):
    """Build a client from the ``~/.darwin/config.yaml`` configuration file.

    Parameters
    ----------
    team_slug : Optional[str]
        Team slug forwarded to ``Client.from_config``. When it is empty and
        ``dataset_identifier`` is given, the slug is taken from the parsed identifier.
    offline : bool
        Flag for using an offline client. It is not read by this function body.
    maybe_guest : bool
        When the configuration is missing, return a guest client instead of
        reporting an error.
    dataset_identifier : Optional[str]
        Dataset identifier, parsed only to derive ``team_slug``.

    Returns
    -------
    Client
    The client requested
    """
    if dataset_identifier and not team_slug:
        team_slug = DatasetIdentifier.parse(dataset_identifier).team_slug

    config_file = Path.home() / ".darwin" / "config.yaml"
    try:
        return Client.from_config(config_file, team_slug=team_slug)
    except MissingConfig:
        if not maybe_guest:
            _error("Authenticate first")
        else:
            return Client.from_guest()
    except (InvalidLogin, Unauthenticated):
        # Both failures are reported to the user with the same message.
        _error("Please re-authenticate")
Пример #2
0
    def it_returns_remote_files(darwin_client: Client):
        """Call ``fetch_remote_files`` against a mocked dataset items endpoint."""
        dataset_id = 1
        # The query string is the URL-encoded form of page[size]=500&page[from]=0.
        items_path: str = f"/datasets/{dataset_id}/items?page%5Bsize%5D=500&page%5Bfrom%5D=0"
        responses.add(responses.POST, darwin_client.url + items_path, json={}, status=200)

        paging = {"page[size]": 500, "page[from]": 0}
        darwin_client.fetch_remote_files(dataset_id, paging, {}, "v7")
Пример #3
0
def run_demo(
    *,
    team_slug: Optional[str],
    dataset_slug: Optional[str] = None,
    datasets_dir: Optional[str] = None,
    api_key: Optional[str] = None,
    config_path: Optional[Path] = None,
):
    """
    Download a Darwin dataset on the file system and split it.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select
    dataset_slug : str
        This is the dataset name with everything lower-case, removed specials characters and
        spaces are replaced by dashes, e.g., `bird-species`. This string is unique within a team
    datasets_dir : str
        Datasets directory forwarded to ``authenticate`` when ``api_key`` is given
    api_key: str
        API key to authenticate the client. When given, ``authenticate`` is called with it
        before the client is created
    config_path: Path
        Path to a configuration file which contains the authentication information to use.
        When omitted, ``Client.local`` is used instead

    Returns
    -------
    splits : dict
        Result of ``split_dataset`` for the pulled dataset. Keys are the different splits
        (random, tags, ...) and values are the relative file names
    """
    # Authenticate the new KEY if available
    if api_key is not None:
        authenticate(api_key=api_key,
                     default_team=True,
                     datasets_dir=datasets_dir)
    # Get the client used to perform remote operations
    if config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.local(team_slug=team_slug)
    # Create a dataset identifier
    dataset_identifier = DatasetIdentifier.from_slug(dataset_slug=dataset_slug,
                                                     team_slug=team_slug)
    # Get an object representing the remote dataset
    ds = client.get_remote_dataset(dataset_identifier=dataset_identifier)
    # Download the dataset on the local file system
    ds.pull()
    # Split the dataset in train/val/test
    splits = split_dataset(dataset=ds)
    # The documented contract is to hand the splits back to the caller;
    # previously they were computed and then dropped.
    return splits
Пример #4
0
    def it_raises_if_workflow_id_is_not_found(darwin_client: Client):
        """An empty workflow response makes ``instantitate_item`` raise ``ValueError``."""
        item_id: int = 1234
        workflow_path: str = f"/dataset_items/{item_id}/workflow"
        # The mocked body carries no workflow id.
        empty_body: Dict[str, Any] = {}
        responses.add(responses.POST, darwin_client.url + workflow_path, json=empty_body, status=200)

        with pytest.raises(ValueError) as raised:
            darwin_client.instantitate_item(item_id)

        expected_message = f"No Workflow Id found for item_id: {item_id}"
        assert str(raised.value) == expected_message
Пример #5
0
def darwin_client(darwin_config_path: Path, darwin_datasets_path: Path, team_slug: str) -> Client:
    """Return a ``Client`` whose config targets localhost and holds a mock API key for ``team_slug``."""
    config = Config(darwin_config_path)
    # (key path, value) pairs, written in this order.
    settings = (
        (["global", "api_endpoint"], "http://localhost/api"),
        (["global", "base_url"], "http://localhost"),
        (["teams", team_slug, "api_key"], "mock_api_key"),
        (["teams", team_slug, "datasets_dir"], str(darwin_datasets_path)),
    )
    for key_path, value in settings:
        config.put(key_path, value)
    return Client(config=config)
Пример #6
0
    def it_raises_if_comment_id_is_not_found(darwin_client: Client):
        """An empty comment-thread response makes ``post_workflow_comment`` raise ``ValueError``."""
        workflow_id = 1234
        threads_path: str = f"/workflows/{workflow_id}/workflow_comment_threads"
        # The mocked body carries no comment id.
        empty_body: Dict[str, Any] = {}
        responses.add(responses.POST, darwin_client.url + threads_path, json=empty_body, status=200)

        with pytest.raises(ValueError) as raised:
            darwin_client.post_workflow_comment(workflow_id, "My comment.")

        expected_message = f"Unable to retrieve comment id for workflow: {workflow_id}."
        assert str(raised.value) == expected_message
Пример #7
0
def remote_dataset(dataset_slug: str, local_config_file: Config):
    """Return a ``RemoteDataset`` named TEST_DATASET (team "v7", id 1) backed by the given config."""
    dataset_fields = dict(team="v7", name="TEST_DATASET", slug=dataset_slug, dataset_id=1)
    return RemoteDataset(client=Client(local_config_file), **dataset_fields)
Пример #8
0
    def it_returns_the_created_dataset(darwin_client: Client):
        """``create_dataset`` turns the mocked POST /datasets payload into a ``RemoteDataset``."""
        dataset_name = "my-dataset"
        created_payload: Dict[str, Any] = {
            "name": dataset_name,
            "slug": dataset_name,
            "id": 1,
            "num_images": 1,
            "num_videos": 0,
            "progress": 0,
        }
        responses.add(responses.POST, darwin_client.url + "/datasets", json=created_payload, status=200)

        actual = darwin_client.create_dataset(dataset_name, "v7")
        expected = RemoteDataset(
            team="v7",
            name=dataset_name,
            slug=dataset_name,
            dataset_id=1,
            item_count=1,
            client=darwin_client,
        )

        assert_dataset(actual, expected)
Пример #9
0
    def it_returns_the_dataset(darwin_client: Client):
        """``get_remote_dataset`` finds "v7/dataset-slug-1" in the mocked GET /datasets listing."""
        listed_dataset = {
            "name": "dataset-name-1",
            "slug": "dataset-slug-1",
            "id": 1,
            "num_images": 1,
            "num_videos": 0,
            "progress": 0,
        }
        responses.add(responses.GET, darwin_client.url + "/datasets", json=[listed_dataset], status=200)

        actual = darwin_client.get_remote_dataset("v7/dataset-slug-1")
        expected = RemoteDataset(
            team="v7",
            name="dataset-name-1",
            slug="dataset-slug-1",
            dataset_id=1,
            item_count=1,
            client=darwin_client,
        )

        assert_dataset(actual, expected)
Пример #10
0
def get_dataset_files(dataset_slug) -> Iterator[DatasetItem]:
    """Return the result of ``fetch_remote_files()`` for the dataset identified by ``dataset_slug``."""
    remote = Client.from_api_key(API_KEY).get_remote_dataset(
        dataset_identifier=DatasetIdentifier.parse(dataset_slug)
    )
    return remote.fetch_remote_files()
Пример #11
0
def add_labels_to_dataset(dataset_slug: str, labels: List[str], label_type: str):
    """Create one annotation class per entry of ``labels`` on the given dataset.

    ``label_type`` must be 'polygon' or 'tag'; the same type is used for every label.
    """
    assert label_type in ('polygon', 'tag')
    api_client = Client.from_api_key(API_KEY)
    remote = api_client.get_remote_dataset(
        dataset_identifier=DatasetIdentifier.parse(dataset_slug)
    )

    for class_name in labels:
        remote.create_annotation_class(class_name, label_type)
Пример #12
0
    def it_raises_if_dataset_is_not_found(darwin_client: Client):
        """``get_remote_dataset`` raises ``NotFound`` when the slug is not in the mocked listing."""
        # Only "dataset-slug-1" is listed; the lookup below asks for "dataset-slug-2".
        listed_dataset = {
            "name": "dataset-name-1",
            "slug": "dataset-slug-1",
            "id": 1,
            "num_images": 1,
            "num_videos": 0,
            "progress": 0,
        }
        responses.add(responses.GET, darwin_client.url + "/datasets", json=[listed_dataset], status=200)

        with pytest.raises(NotFound):
            darwin_client.get_remote_dataset("v7/dataset-slug-2")
Пример #13
0
def populate_dataset_annotations(dataset_slug, format_name: str, file_paths: List[str]):
    """Import the annotation files in ``file_paths`` into the dataset using the named format.

    ``format_name`` must be 'darwin', 'coco' or 'pascal_voc'; the matching parser is looked up
    in ``darwin.importer.formats.supported_formats``.
    """
    assert format_name in ('darwin', 'coco', 'pascal_voc')
    api_client = Client.from_api_key(API_KEY)
    remote = api_client.get_remote_dataset(
        dataset_identifier=DatasetIdentifier.parse(dataset_slug)
    )

    # supported_formats is iterated as (name, parser) pairs.
    parsers_by_name = {}
    for name, format_parser in darwin.importer.formats.supported_formats:
        parsers_by_name[name] = format_parser

    importer.import_annotations(remote, parsers_by_name[format_name], file_paths)
Пример #14
0
    def it_returns_workflow_id(darwin_client: Client):
        """``instantitate_item`` returns the ``current_workflow_id`` from the mocked response."""
        item_id: int = 1234
        workflow_id: int = 1
        workflow_path: str = f"/dataset_items/{item_id}/workflow"
        body: Dict[str, Any] = {"current_workflow_id": workflow_id}
        responses.add(responses.POST, darwin_client.url + workflow_path, json=body, status=200)

        returned_id = darwin_client.instantitate_item(item_id)
        assert returned_id == workflow_id
Пример #15
0
    def it_returns_comment_id(darwin_client: Client):
        """``post_workflow_comment`` returns the ``id`` from the mocked comment-thread response."""
        comment_id: int = 1234
        workflow_id: int = 1
        threads_path: str = f"/workflows/{workflow_id}/workflow_comment_threads"
        body: Dict[str, Any] = {"id": comment_id}
        responses.add(responses.POST, darwin_client.url + threads_path, json=body, status=200)

        returned_id = darwin_client.post_workflow_comment(workflow_id, "My comment.")
        assert returned_id == comment_id
Пример #16
0
def create_dataset(dataset_slug):
    """Create a dataset named after the slug part of ``dataset_slug``.

    Returns a dict with the created dataset's ``name``, ``id``, ``slug`` and ``remote_path``.
    """
    api_client = Client.from_api_key(API_KEY)
    parsed = DatasetIdentifier.parse(dataset_slug)
    created = api_client.create_dataset(name=parsed.dataset_slug)

    return {
        'name': created.name,
        'id': created.dataset_id,
        'slug': created.slug,
        'remote_path': created.remote_path,
    }
Пример #17
0
def authenticate(api_key: str,
                 default_team: Optional[bool] = None,
                 datasets_dir: Optional[Path] = None) -> Config:
    """
    Create a client from the API key and persist its configuration.

    Parameters
    ----------
    api_key : str
        API key to use for the client login. It is validated before use.
    default_team: Optional[bool]
        Flag to make the team the default one. When None, the user is asked interactively.
    datasets_dir: Optional[Path]
        Dataset directory on the file system. When None, the user is prompted for it.
        A leading ``~`` is expanded and the directory is created if needed.

    Returns
    -------
    Config
    The value returned by ``persist_client_configuration``.
    """
    validate_api_key(api_key)

    try:
        client = Client.from_api_key(api_key=api_key)
        # Make sure the directory that holds config.yaml exists.
        (Path.home() / ".darwin").mkdir(exist_ok=True)

        if default_team is None:
            answer = input(f"Make {client.default_team} the default team? [y/N] ")
            default_team = answer in ["Y", "y"]
        if datasets_dir is None:
            datasets_dir = Path(prompt("Datasets directory", "~/.darwin/datasets"))

        # Resolve the home folder if the path starts with ~ or ~user.
        datasets_dir = Path(datasets_dir).expanduser()
        datasets_dir.mkdir(parents=True, exist_ok=True)
        client.set_datasets_dir(datasets_dir)

        if default_team:
            default_team_name: Optional[str] = client.default_team
        else:
            default_team_name = None
        return persist_client_configuration(client, default_team=default_team_name)
    except InvalidLogin:
        _error("Invalid API key")
Пример #18
0
    def it_returns_list_of_datasets(darwin_client: Client):
        """``list_remote_datasets`` yields one ``RemoteDataset`` per entry of the mocked listing."""
        team_slug: str = "v7"
        # Two datasets, numbered 1 and 2; dataset n has id n and n images.
        json_response: List[Dict[str, Any]] = [
            {
                "name": f"dataset-name-{number}",
                "slug": f"dataset-slug-{number}",
                "id": number,
                "num_images": number,
                "num_videos": 0,
                "progress": 0,
            }
            for number in (1, 2)
        ]
        responses.add(responses.GET, darwin_client.url + "/datasets", json=json_response, status=200)

        remote_datasets = list(darwin_client.list_remote_datasets(team_slug))
        expected_datasets = [
            RemoteDataset(
                team=team_slug,
                name=f"dataset-name-{number}",
                slug=f"dataset-slug-{number}",
                dataset_id=number,
                item_count=number,
                client=darwin_client,
            )
            for number in (1, 2)
        ]

        for position, expected in enumerate(expected_datasets):
            assert_dataset(remote_datasets[position], expected)
Пример #19
0
def _populate_dataset(dataset_slug, items):
    """Register ``items`` as external data on the dataset, in at most 100 PUT requests.

    ``items`` is split with ``np.array_split`` into ``min(len(items), 100)`` batches and each
    batch is sent as the ``files`` field of the ``external_data`` endpoint payload.
    An empty ``items`` sends nothing.

    Raises whatever ``response.raise_for_status()`` raises for a failed request.
    """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    # np.array_split raises ValueError when asked for zero sections, so an empty
    # input is handled here. len() is used because ``items`` may be an array,
    # whose truth value is ambiguous.
    if len(items) == 0:
        return

    item_batches = [x.tolist() for x in np.array_split(items, min(len(items), 100))]
    for idx, batch in enumerate(item_batches):
        print(f'Batch {idx + 1}/{len(item_batches)}')
        payload = {
            'files': batch
        }
        print(payload)
        print(dataset.dataset_id)
        response = requests.put(f'https://darwin.v7labs.com/api/datasets/{dataset.dataset_id}/external_data', headers=HEADERS,
                                json=payload)

        response.raise_for_status()
Пример #20
0
    def it_returns_remote_classes(darwin_client: Client):
        """``fetch_remote_classes`` returns the annotation classes from the mocked team endpoint."""
        team_slug: str = "v7"
        classes_path: str = f"/teams/{team_slug}/annotation_classes?include_tags=true"
        tag_class: Dict[str, Any] = {
            "annotation_class_image_url": None,
            "annotation_types": ["tag"],
            "dataset_id": 215,
            "datasets": [{"id": 215}, {"id": 265}],
            "description": " Tag 2",
            "id": 345,
            "images": [],
            "inserted_at": "2021-01-25T02:27:10",
            "metadata": {"_color": "rgba(0,255,0,1.0)", "tag": {}},
            "name": " Tag 2",
            "team_id": 2,
            "updated_at": "2021-01-25T02:27:10",
        }
        body: Dict[str, Any] = {"annotation_classes": [tag_class]}
        responses.add(responses.GET, darwin_client.url + classes_path, json=body, status=200)

        fetched: List[Dict[str, Any]] = darwin_client.fetch_remote_classes(team_slug)
        first_class: Dict[str, Any] = fetched[0]

        assert first_class["annotation_class_image_url"] is None
        # The remaining fields are compared by equality, in a fixed order.
        expected_fields = (
            ("annotation_types", ["tag"]),
            ("dataset_id", 215),
            ("datasets", [{"id": 215}, {"id": 265}]),
            ("id", 345),
        )
        for field, expected in expected_fields:
            assert first_class[field] == expected
Пример #21
0
def _load_client(team: Optional[str] = None, offline: bool = False):
    """Build a client from the ``~/.darwin/config.yaml`` configuration file.

    Parameters
    ----------
    team : Optional[str]
        Team slug forwarded to ``Client.from_config``.
    offline : bool
        Flag for using an offline client. It is not read by this function body.

    Returns
    -------
    Client
    The client requested
    """
    config_file = Path.home() / ".darwin" / "config.yaml"
    try:
        return Client.from_config(config_file, team_slug=team)
    except MissingConfig:
        _error("Authenticate first")
    except (InvalidLogin, Unauthenticated):
        # Both failures are reported to the user with the same message.
        _error("Please re-authenticate")
Пример #22
0
    def it_returns_list_of_features(darwin_client: Client):
        """``get_team_features`` maps each mocked feature entry onto a ``Feature``."""
        team_slug: str = "v7"
        features_path: str = f"/teams/{team_slug}/features"
        listed_features = [
            {"enabled": False, "name": "WORKFLOW_V2"},
            {"enabled": True, "name": "BLIND_STAGE"},
        ]
        responses.add(responses.GET, darwin_client.url + features_path, json=listed_features, status=200)

        actual_features = darwin_client.get_team_features(team_slug)
        expected_features = [
            Feature(name="WORKFLOW_V2", enabled=False),
            Feature(name="BLIND_STAGE", enabled=True),
        ]
        assert actual_features == expected_features
Пример #23
0
def get_darwin_dataset(
        *,
        team_slug: Optional[str] = None,
        dataset_slug: Optional[str] = None,
        dataset_id: Optional[str] = None,
        projects_dir: Optional[str] = None,
        token: Optional[str] = None,
        config_path: Optional[Path] = None,
        email: Optional[str] = None,
        password: Optional[str] = None,
        val_percentage: Optional[float] = 0.1,
        test_percentage: Optional[float] = 0.2,
        force_resplit: Optional[bool] = False,
        split_seed: Optional[int] = 42
):
    """
    Pull a Darwin dataset to the file system and split it.

    The client is created from the first credentials available, checked in this order:
    email and password, token, configuration file, default client.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select. When None, no team is selected on the client
    dataset_slug : str
        This is the dataset name with everything lower-case, removed specials characters and
        spaces are replaced by dashes, e.g., `bird-species`. This string is unique within a team
    dataset_id : str
        Dataset id forwarded to ``get_remote_dataset`` together with ``dataset_slug``
    projects_dir : str
        Path where the client should be initialized from (aka the root path)
    token : str
        Access token used to authenticate the client
    config_path : Path
        Path to a configuration file to use to create the client
    email : str
        Email of the Darwin user to use for the login
    password : str
        Password of the Darwin user to use for the login
    val_percentage : float
        Percentage of images used in the validation set
    test_percentage : float
        Percentage of images used in the test set
    force_resplit : bool
        Discard previous split and create a new one
    split_seed : int
        Fix seed for random split creation

    Returns
    -------
    splits : dict
        Keys are the different splits (random, tags, ...) and values are the relative file names
    """
    # Pick the authentication route from the credentials supplied.
    has_login = email is not None and password is not None
    if has_login:
        client = Client.login(email=email, password=password, projects_dir=projects_dir)
    elif token is not None:
        client = Client.from_token(token=token, projects_dir=projects_dir)
    elif config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.default(projects_dir=projects_dir)

    if team_slug is not None:
        client.set_team(slug=team_slug)

    # Fetch the remote dataset and download its data before splitting.
    remote = client.get_remote_dataset(slug=dataset_slug, dataset_id=dataset_id)
    remote.pull()

    split_options = dict(
        val_percentage=val_percentage,
        test_percentage=test_percentage,
        force_resplit=force_resplit,
        split_seed=split_seed,
    )
    return split_dataset(dataset=remote, **split_options)
Пример #24
0
def get_annotations(dataset_slug, anno_dest_dir='annos', *, clear_directory=False, verbose=False):
    """ Export, download and unpack the annotations of a dataset

    Creates a temporary export named 'export_tmp' for the items whose status is
    'review' or 'complete', waits for it, downloads the zip, extracts it into
    ``anno_dest_dir`` and finally deletes the export again.

    dataset_slug: slug of the dataset to retrieve annotations for
    anno_dest_dir: directory to store the annotation files
    clear_directory: delete all existing files in target directory if they exist (if False raise an error if files exist)
    verbose: log API responses

    Returns the export entry with 'download_url' removed and 'annotation_paths'
    (paths of the entries found in ``anno_dest_dir``) added.

    Raises RuntimeError if the export is not ready within the timeout or if the
    target directory has files and ``clear_directory`` is False; HTTP failures
    surface through ``raise_for_status()``.
    """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    filters = {'statuses': 'review,complete'}
    ids = [file.id for file in dataset.fetch_remote_files(filters)]

    # darwin-py doesn't support dataset_item_ids
    # uses also /datasets/{self.dataset_id}/exports
    # dataset.export(annotation_class_ids=annotation_class_ids, name=name, include_url_token=include_url_token)

    export_name = 'export_tmp'
    exports_url = f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports'
    print(ids)

    payload = dict(
        format='json',
        name=export_name,
        include_authorship=True,
        include_export_token=True,
        dataset_item_ids=ids
    )

    print('Creating export...')
    response_create = requests.post(exports_url,
                                    headers=HEADERS,
                                    json=payload)
    response_create.raise_for_status()
    if verbose:
        pprint.pprint(['create_export', response_create.json()])

    def get_export(timeout=60):
        # Poll the export listing every 0.5s until our export is the latest one.
        timeout_stop = time.time() + timeout
        while True:
            response_retrieve = requests.get(exports_url, headers=HEADERS)
            # Check the status first so an HTTP error is reported as such,
            # instead of as a failure to decode the error body as JSON.
            response_retrieve.raise_for_status()
            listing = response_retrieve.json()
            if verbose:
                pprint.pprint(['get_export', listing])
            exports = [x for x in listing if x['name'] == export_name]
            if len(exports) == 1 and exports[0]['latest']:
                return exports[0]
            if time.time() > timeout_stop:
                raise RuntimeError('Timeout whilst waiting for export to complete')
            time.sleep(0.5)
            if verbose:
                print('trying again...')

    try:
        print('Waiting for export to complete...')
        export = get_export()

        # download export data
        # (this is also available through dataset.annotations as a single dict? maybe deprecated?)
        print('Downloading annotations...')
        with requests.get(export['download_url'], stream=True) as r:
            r.raise_for_status()
            if verbose:
                pprint.pprint(['download_annos', r.status_code])
            with mktmpdir() as tmp_dir:
                tmp_file = os.path.join(tmp_dir, 'export.zip')
                with open(tmp_file, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                if os.path.exists(anno_dest_dir):
                    anno_files = os.listdir(anno_dest_dir)
                    if len(anno_files) > 0:
                        if clear_directory:
                            for file in anno_files:
                                os.remove(os.path.join(anno_dest_dir, file))
                        else:
                            raise RuntimeError('Directory already exists and contains files!')
                else:
                    os.makedirs(anno_dest_dir)
                with ZipFile(tmp_file, 'r') as f:
                    f.extractall(anno_dest_dir)
                anno_paths = [os.path.join(anno_dest_dir, x) for x in os.listdir(anno_dest_dir)]
    except Exception:
        # Remove the temporary export before propagating the failure.
        response_delete = requests.delete(f'{exports_url}/{export_name}', headers=HEADERS)
        response_delete.raise_for_status()
        if verbose:
            pprint.pprint(['delete_export', response_delete.status_code])
        raise

    print('Export completed, cleaning up...')
    response_delete = requests.delete(f'{exports_url}/{export_name}', headers=HEADERS)
    response_delete.raise_for_status()

    del export['download_url']
    export['annotation_paths'] = anno_paths
    return export