def _load_client(
    team_slug: Optional[str] = None,
    offline: bool = False,
    maybe_guest: bool = False,
    dataset_identifier: Optional[str] = None,
):
    """Fetches a client, potentially offline

    Parameters
    ----------
    team_slug : Optional[str]
        Slug of the team to select
    offline : bool
        Flag for using an offline client
    maybe_guest : bool
        Flag to make a guest client, if config is missing
    dataset_identifier : Optional[str]
        Identifier in the form `team_slug/dataset_slug`, used to infer the team if none is given

    Returns
    -------
    Client
        The client requested
    """
    if not team_slug and dataset_identifier:
        team_slug = DatasetIdentifier.parse(dataset_identifier).team_slug
    try:
        config_dir = Path.home() / ".darwin" / "config.yaml"
        client = Client.from_config(config_dir, team_slug=team_slug)
        return client
    except MissingConfig:
        if maybe_guest:
            return Client.from_guest()
        else:
            _error("Authenticate first")
    except InvalidLogin:
        _error("Please re-authenticate")
    except Unauthenticated:
        _error("Please re-authenticate")
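
# A minimal usage sketch for _load_client (not part of the original snippet), assuming an
# existing ~/.darwin/config.yaml and the hypothetical team slug "my-team":
def _example_load_client():
    client = _load_client(team_slug="my-team", maybe_guest=True)
    print(client)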
def it_returns_remote_files(darwin_client: Client):
    dataset_id = 1
    endpoint: str = f"/datasets/{dataset_id}/items?page%5Bsize%5D=500&page%5Bfrom%5D=0"
    responses.add(responses.POST, darwin_client.url + endpoint, json={}, status=200)

    darwin_client.fetch_remote_files(dataset_id, {"page[size]": 500, "page[from]": 0}, {}, "v7")
def run_demo(
    *,
    team_slug: Optional[str],
    dataset_slug: Optional[str] = None,
    datasets_dir: Optional[str] = None,
    api_key: Optional[str] = None,
    config_path: Optional[Path] = None,
):
    """
    Download a Darwin dataset on the file system.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select
    dataset_slug : str
        This is the dataset name with everything lower-case, special characters removed and
        spaces replaced by dashes, e.g., `bird-species`. This string is unique within a team
    datasets_dir : Path
        Path where the client should be initialized from (aka the root path)
    api_key : str
        API key to authenticate the client
    config_path : Path
        Path to a configuration file which contains the authentication information to use

    Returns
    -------
    splits : dict
        Keys are the different splits (random, tags, ...) and values are the relative file names
    """
    # Authenticate with the new key if available
    if api_key is not None:
        authenticate(api_key=api_key, default_team=True, datasets_dir=datasets_dir)
    # Get the client used to perform remote operations
    if config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.local(team_slug=team_slug)
    # Create a dataset identifier
    dataset_identifier = DatasetIdentifier.from_slug(dataset_slug=dataset_slug, team_slug=team_slug)
    # Get an object representing the remote dataset
    ds = client.get_remote_dataset(dataset_identifier=dataset_identifier)
    # Download the dataset on the local file system
    ds.pull()
    # Split the dataset in train/val/test and return the splits, as promised by the docstring
    splits = split_dataset(dataset=ds)
    return splits
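
# A minimal usage sketch for run_demo, assuming the hypothetical team slug "my-team",
# the hypothetical dataset slug "bird-species", and a valid API key in place of "MY_API_KEY":
def _example_run_demo():
    splits = run_demo(
        team_slug="my-team",
        dataset_slug="bird-species",
        datasets_dir=str(Path.home() / ".darwin" / "datasets"),
        api_key="MY_API_KEY",
    )
    print(splits)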
def it_raises_if_workflow_id_is_not_found(darwin_client: Client):
    item_id: int = 1234
    endpoint: str = f"/dataset_items/{item_id}/workflow"
    json_response: Dict[str, Any] = {}
    responses.add(responses.POST, darwin_client.url + endpoint, json=json_response, status=200)

    with pytest.raises(ValueError) as exception:
        darwin_client.instantitate_item(item_id)

    assert str(exception.value) == f"No Workflow Id found for item_id: {item_id}"
def darwin_client(darwin_config_path: Path, darwin_datasets_path: Path, team_slug: str) -> Client:
    config = Config(darwin_config_path)
    config.put(["global", "api_endpoint"], "http://localhost/api")
    config.put(["global", "base_url"], "http://localhost")
    config.put(["teams", team_slug, "api_key"], "mock_api_key")
    config.put(["teams", team_slug, "datasets_dir"], str(darwin_datasets_path))
    return Client(config=config)
def it_raises_if_comment_id_is_not_found(darwin_client: Client):
    workflow_id = 1234
    endpoint: str = f"/workflows/{workflow_id}/workflow_comment_threads"
    json_response: Dict[str, Any] = {}
    responses.add(responses.POST, darwin_client.url + endpoint, json=json_response, status=200)

    with pytest.raises(ValueError) as exception:
        darwin_client.post_workflow_comment(workflow_id, "My comment.")

    assert str(exception.value) == f"Unable to retrieve comment id for workflow: {workflow_id}."
def remote_dataset(dataset_slug: str, local_config_file: Config):
    client = Client(local_config_file)
    return RemoteDataset(client=client, team="v7", name="TEST_DATASET", slug=dataset_slug, dataset_id=1)
def it_returns_the_created_dataset(darwin_client: Client):
    endpoint: str = "/datasets"
    json_response: Dict[str, Any] = {
        "name": "my-dataset",
        "slug": "my-dataset",
        "id": 1,
        "num_images": 1,
        "num_videos": 0,
        "progress": 0,
    }
    responses.add(responses.POST, darwin_client.url + endpoint, json=json_response, status=200)

    actual_dataset = darwin_client.create_dataset("my-dataset", "v7")
    expected_dataset = RemoteDataset(
        team="v7",
        name="my-dataset",
        slug="my-dataset",
        dataset_id=1,
        item_count=1,
        client=darwin_client,
    )

    assert_dataset(actual_dataset, expected_dataset)
def it_returns_the_dataset(darwin_client: Client):
    endpoint: str = "/datasets"
    json_response = [{
        "name": "dataset-name-1",
        "slug": "dataset-slug-1",
        "id": 1,
        "num_images": 1,
        "num_videos": 0,
        "progress": 0,
    }]
    responses.add(responses.GET, darwin_client.url + endpoint, json=json_response, status=200)

    actual_dataset = darwin_client.get_remote_dataset("v7/dataset-slug-1")
    expected_dataset = RemoteDataset(
        team="v7",
        name="dataset-name-1",
        slug="dataset-slug-1",
        dataset_id=1,
        item_count=1,
        client=darwin_client,
    )

    assert_dataset(actual_dataset, expected_dataset)
def get_dataset_files(dataset_slug) -> Iterator[DatasetItem]:
    """
    Return the items in a dataset (as an iterator of DatasetItem) along with their status
    """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)
    resp = dataset.fetch_remote_files()
    return resp
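
# A minimal usage sketch for get_dataset_files, assuming the module-level API_KEY is configured
# and that "my-team/my-dataset" is a hypothetical dataset identifier:
def _example_list_dataset_files():
    for item in get_dataset_files("my-team/my-dataset"):
        print(item.filename, item.status)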
def add_labels_to_dataset(dataset_slug: str, labels: List[str], label_type: str):
    """
    Add labels to a dataset
    """
    assert label_type in ['polygon', 'tag']

    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    for label in labels:
        dataset.create_annotation_class(label, label_type)
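
# A minimal usage sketch for add_labels_to_dataset, assuming the module-level API_KEY is
# configured and that "my-team/my-dataset" is a hypothetical dataset identifier:
def _example_add_labels():
    add_labels_to_dataset("my-team/my-dataset", labels=["cat", "dog"], label_type="tag")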
def it_raises_if_dataset_is_not_found(darwin_client: Client):
    endpoint: str = "/datasets"
    json_response = [{
        "name": "dataset-name-1",
        "slug": "dataset-slug-1",
        "id": 1,
        "num_images": 1,
        "num_videos": 0,
        "progress": 0,
    }]
    responses.add(responses.GET, darwin_client.url + endpoint, json=json_response, status=200)

    with pytest.raises(NotFound):
        darwin_client.get_remote_dataset("v7/dataset-slug-2")
def populate_dataset_annotations(dataset_slug, format_name: str, file_paths: List[str]):
    assert format_name in ['darwin', 'coco', 'pascal_voc']

    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    format_dict = {k: v for (k, v) in darwin.importer.formats.supported_formats}
    parser = format_dict[format_name]

    importer.import_annotations(dataset, parser, file_paths)
def it_returns_workflow_id(darwin_client: Client):
    item_id: int = 1234
    workflow_id: int = 1
    endpoint: str = f"/dataset_items/{item_id}/workflow"
    json_response: Dict[str, Any] = {"current_workflow_id": workflow_id}
    responses.add(responses.POST, darwin_client.url + endpoint, json=json_response, status=200)

    assert darwin_client.instantitate_item(item_id) == workflow_id
def it_returns_comment_id(darwin_client: Client):
    comment_id: int = 1234
    workflow_id: int = 1
    endpoint: str = f"/workflows/{workflow_id}/workflow_comment_threads"
    json_response: Dict[str, Any] = {"id": comment_id}
    responses.add(responses.POST, darwin_client.url + endpoint, json=json_response, status=200)

    assert darwin_client.post_workflow_comment(workflow_id, "My comment.") == comment_id
def create_dataset(dataset_slug):
    """
    Create a new empty dataset
    """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.create_dataset(name=identifier.dataset_slug)
    dataset_info = dict(
        name=dataset.name,
        id=dataset.dataset_id,
        slug=dataset.slug,
        remote_path=dataset.remote_path,
    )
    return dataset_info
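
# A minimal usage sketch for create_dataset, assuming the module-level API_KEY is configured
# and that "my-team/my-new-dataset" is a hypothetical dataset identifier:
def _example_create_dataset():
    info = create_dataset("my-team/my-new-dataset")
    print(info["name"], info["id"], info["slug"])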
def authenticate(api_key: str, default_team: Optional[bool] = None, datasets_dir: Optional[Path] = None) -> Config:
    """
    Authenticate the API key against the server and create a configuration file for it.

    Parameters
    ----------
    api_key : str
        API key to use for the client login.
    default_team : Optional[bool]
        Flag to make the team the default one. Defaults to None.
    datasets_dir : Optional[Path]
        Dataset directory on the file system. Defaults to None.

    Returns
    -------
    Config
        A configuration object to handle YAML files.
    """
    # Resolve the home folder if the datasets_dir starts with ~ or ~user
    validate_api_key(api_key)

    try:
        client = Client.from_api_key(api_key=api_key)
        config_path = Path.home() / ".darwin" / "config.yaml"
        config_path.parent.mkdir(exist_ok=True)

        if default_team is None:
            default_team = input(f"Make {client.default_team} the default team? [y/N] ") in ["Y", "y"]
        if datasets_dir is None:
            datasets_dir = Path(prompt("Datasets directory", "~/.darwin/datasets"))

        datasets_dir = Path(datasets_dir).expanduser()
        Path(datasets_dir).mkdir(parents=True, exist_ok=True)

        client.set_datasets_dir(datasets_dir)

        default_team_name: Optional[str] = client.default_team if default_team else None
        return persist_client_configuration(client, default_team=default_team_name)
    except InvalidLogin:
        _error("Invalid API key")
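
# A minimal usage sketch for authenticate, assuming "MY_API_KEY" stands in for a valid Darwin
# API key and that the default datasets directory is acceptable:
def _example_authenticate():
    config = authenticate(
        api_key="MY_API_KEY",
        default_team=True,
        datasets_dir=Path("~/.darwin/datasets"),
    )
    print(config)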
def it_returns_list_of_datasets(darwin_client: Client):
    team_slug: str = "v7"
    endpoint: str = "/datasets"
    json_response: List[Dict[str, Any]] = [
        {
            "name": "dataset-name-1",
            "slug": "dataset-slug-1",
            "id": 1,
            "num_images": 1,
            "num_videos": 0,
            "progress": 0,
        },
        {
            "name": "dataset-name-2",
            "slug": "dataset-slug-2",
            "id": 2,
            "num_images": 2,
            "num_videos": 0,
            "progress": 0,
        },
    ]
    responses.add(responses.GET, darwin_client.url + endpoint, json=json_response, status=200)

    remote_datasets = list(darwin_client.list_remote_datasets(team_slug))

    expected_dataset_1 = RemoteDataset(
        team=team_slug,
        name="dataset-name-1",
        slug="dataset-slug-1",
        dataset_id=1,
        item_count=1,
        client=darwin_client,
    )
    expected_dataset_2 = RemoteDataset(
        team=team_slug,
        name="dataset-name-2",
        slug="dataset-slug-2",
        dataset_id=2,
        item_count=2,
        client=darwin_client,
    )

    assert_dataset(remote_datasets[0], expected_dataset_1)
    assert_dataset(remote_datasets[1], expected_dataset_2)
def _populate_dataset(dataset_slug, items):
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    # Split the items into at most 100 roughly equal batches and register each batch
    # via the external_data endpoint
    item_batches = [x.tolist() for x in np.array_split(items, min(len(items), 100))]
    for idx, batch in enumerate(item_batches):
        print(f'Batch {idx + 1}/{len(item_batches)}')
        payload = {'files': batch}
        print(payload)
        print(dataset.dataset_id)
        response = requests.put(
            f'https://darwin.v7labs.com/api/datasets/{dataset.dataset_id}/external_data',
            headers=HEADERS,
            json=payload,
        )
        response.raise_for_status()
def it_returns_remote_classes(darwin_client: Client):
    team_slug: str = "v7"
    endpoint: str = f"/teams/{team_slug}/annotation_classes?include_tags=true"
    response: Dict[str, Any] = {
        "annotation_classes": [{
            "annotation_class_image_url": None,
            "annotation_types": ["tag"],
            "dataset_id": 215,
            "datasets": [{"id": 215}, {"id": 265}],
            "description": " Tag 2",
            "id": 345,
            "images": [],
            "inserted_at": "2021-01-25T02:27:10",
            "metadata": {"_color": "rgba(0,255,0,1.0)", "tag": {}},
            "name": " Tag 2",
            "team_id": 2,
            "updated_at": "2021-01-25T02:27:10",
        }]
    }
    responses.add(responses.GET, darwin_client.url + endpoint, json=response, status=200)

    result: List[Dict[str, Any]] = darwin_client.fetch_remote_classes(team_slug)
    annotation_class: Dict[str, Any] = result[0]

    assert annotation_class["annotation_class_image_url"] is None
    assert annotation_class["annotation_types"] == ["tag"]
    assert annotation_class["dataset_id"] == 215
    assert annotation_class["datasets"] == [{"id": 215}, {"id": 265}]
    assert annotation_class["id"] == 345
def _load_client(team: Optional[str] = None, offline: bool = False):
    """Fetches a client, potentially offline

    Parameters
    ----------
    offline : bool
        Flag for using an offline client

    Returns
    -------
    Client
        The client requested
    """
    try:
        config_dir = Path.home() / ".darwin" / "config.yaml"
        client = Client.from_config(config_dir, team_slug=team)
        return client
    except MissingConfig:
        _error("Authenticate first")
    except InvalidLogin:
        _error("Please re-authenticate")
    except Unauthenticated:
        _error("Please re-authenticate")
def it_returns_list_of_features(darwin_client: Client):
    team_slug: str = "v7"
    endpoint: str = f"/teams/{team_slug}/features"
    json_response = [
        {"enabled": False, "name": "WORKFLOW_V2"},
        {"enabled": True, "name": "BLIND_STAGE"},
    ]
    responses.add(responses.GET, darwin_client.url + endpoint, json=json_response, status=200)

    assert darwin_client.get_team_features(team_slug) == [
        Feature(name="WORKFLOW_V2", enabled=False),
        Feature(name="BLIND_STAGE", enabled=True),
    ]
def get_darwin_dataset(
    *,
    team_slug: Optional[str] = None,
    dataset_slug: Optional[str] = None,
    dataset_id: Optional[str] = None,
    projects_dir: Optional[str] = None,
    token: Optional[str] = None,
    config_path: Optional[Path] = None,
    email: Optional[str] = None,
    password: Optional[str] = None,
    val_percentage: Optional[float] = 0.1,
    test_percentage: Optional[float] = 0.2,
    force_resplit: Optional[bool] = False,
    split_seed: Optional[int] = 42,
):
    """
    Download a Darwin dataset on the file system.
    It is possible to select the way to authenticate and the configuration of the split of the dataset.

    Parameters
    ----------
    team_slug : str
        Slug of the team to select
    dataset_slug : str
        This is the dataset name with everything lower-case, special characters removed and
        spaces replaced by dashes, e.g., `bird-species`. This string is unique within a team
    projects_dir : Path
        Path where the client should be initialized from (aka the root path)
    token : str
        Access token used to authenticate a specific request. It has a lifespan of roughly 8 minutes
    config_path : str
        Path to a configuration file to use to create the client
    email : str
        Email of the Darwin user to use for the login
    password : str
        Password of the Darwin user to use for the login
    val_percentage : float
        Percentage of images used in the validation set
    test_percentage : float
        Percentage of images used in the test set
    force_resplit : bool
        Discard previous split and create a new one
    split_seed : int
        Fix seed for random split creation

    Returns
    -------
    splits : dict
        Keys are the different splits (random, tags, ...) and values are the relative file names
    """
    # Authenticate the client. The priority of the cases is arbitrarily chosen and should actually not matter
    if email is not None and password is not None:
        client = Client.login(email=email, password=password, projects_dir=projects_dir)
    elif token is not None:
        client = Client.from_token(token=token, projects_dir=projects_dir)
    elif config_path is not None:
        client = Client.from_config(config_path=config_path)
    else:
        client = Client.default(projects_dir=projects_dir)
    # Select the desired team
    if team_slug is not None:
        client.set_team(slug=team_slug)
    # Get the remote dataset
    dataset = client.get_remote_dataset(slug=dataset_slug, dataset_id=dataset_id)
    # Download the data on the file system
    dataset.pull()
    # Split the dataset with the params required
    return split_dataset(
        dataset=dataset,
        val_percentage=val_percentage,
        test_percentage=test_percentage,
        force_resplit=force_resplit,
        split_seed=split_seed,
    )
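
# A minimal usage sketch for get_darwin_dataset, assuming an existing local configuration and
# the hypothetical team slug "my-team" and dataset slug "bird-species":
def _example_get_darwin_dataset():
    splits = get_darwin_dataset(
        team_slug="my-team",
        dataset_slug="bird-species",
        val_percentage=0.1,
        test_percentage=0.2,
    )
    print(splits)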
def get_annotations(dataset_slug, anno_dest_dir='annos', *, clear_directory=False, verbose=False):
    """
    Get all annotations for a dataset

    dataset_slug: slug of the dataset to retrieve annotations for
    anno_dest_dir: directory to store the annotation files
    clear_directory: delete all existing files in target directory if they exist
                     (if False raise an error if files exist)
    verbose: log API responses
    """
    client = Client.from_api_key(API_KEY)
    identifier = DatasetIdentifier.parse(dataset_slug)
    dataset = client.get_remote_dataset(dataset_identifier=identifier)

    filters = {'statuses': 'review,complete'}
    ids = [file.id for file in dataset.fetch_remote_files(filters)]

    # darwin-py doesn't support dataset_item_ids
    # uses also /datasets/{self.dataset_id}/exports
    # dataset.export(annotation_class_ids=annotation_class_ids, name=name, include_url_token=include_url_token)
    export_name = 'export_tmp'

    print(ids)
    payload = dict(
        format='json',
        name=export_name,
        include_authorship=True,
        include_export_token=True,
        dataset_item_ids=ids,
    )

    print('Creating export...')
    response_create = requests.post(
        f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports',
        headers=HEADERS,
        json=payload,
    )
    response_create.raise_for_status()
    if verbose:
        pprint.pprint(['create_export', response_create.json()])

    def get_export(timeout=60):
        waiting_for_export = True
        timeout_stop = time.time() + timeout
        while waiting_for_export:
            response_retrieve = requests.get(
                f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports',
                headers=HEADERS,
            )
            if verbose:
                pprint.pprint(['get_export', response_retrieve.json()])
            response_retrieve.raise_for_status()
            exports = list(filter(lambda x: x['name'] == export_name, response_retrieve.json()))
            if len(exports) == 1 and exports[0]['latest']:
                return exports[0]
            else:
                if time.time() > timeout_stop:
                    raise RuntimeError('Timeout whilst waiting for export to complete')
                time.sleep(0.5)
                if verbose:
                    print('trying again...')

    try:
        print('Waiting for export to complete...')
        export = get_export()

        # download export data
        # (this is also available through dataset.annotations as a single dict? maybe deprecated?)
        print('Downloading annotations...')
        with requests.get(export['download_url'], stream=True) as r:
            r.raise_for_status()
            if verbose:
                pprint.pprint(['download_annos', r.status_code])
            with mktmpdir() as tmp_dir:
                tmp_file = os.path.join(tmp_dir, 'export.zip')
                with open(tmp_file, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

                if os.path.exists(anno_dest_dir):
                    anno_files = os.listdir(anno_dest_dir)
                    if len(anno_files) > 0:
                        if clear_directory:
                            for file in anno_files:
                                os.remove(os.path.join(anno_dest_dir, file))
                        else:
                            raise RuntimeError('Directory already exists and contains files!')
                else:
                    os.makedirs(anno_dest_dir)

                with ZipFile(tmp_file, 'r') as f:
                    f.extractall(anno_dest_dir)

                anno_paths = [os.path.join(anno_dest_dir, x) for x in os.listdir(anno_dest_dir)]
    except Exception as e:
        response_delete = requests.delete(
            f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports/{export_name}',
            headers=HEADERS,
        )
        response_delete.raise_for_status()
        if verbose:
            pprint.pprint(['delete_export', response_delete.status_code])
        raise e

    print('Export completed, cleaning up...')
    response_delete = requests.delete(
        f'https://darwin.v7labs.com/api/teams/{TEAM_SLUG}/datasets/{dataset_slug}/exports/{export_name}',
        headers=HEADERS,
    )
    response_delete.raise_for_status()

    del export['download_url']
    export['annotation_paths'] = anno_paths
    return export
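
# A minimal usage sketch for get_annotations, assuming the module-level API_KEY, TEAM_SLUG and
# HEADERS are configured and that "my-dataset" is a hypothetical dataset slug:
def _example_get_annotations():
    export = get_annotations("my-dataset", anno_dest_dir="annos", clear_directory=True, verbose=False)
    print(export["annotation_paths"])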