Example #1
    def push(
        self,
        files_to_upload: List[str],
        blocking: bool = True,
        multi_threaded: bool = True,
        fps: int = 1,
        as_frames: bool = False,
        files_to_exclude: Optional[List[str]] = None,
        resume: bool = False,
        path: Optional[str] = None,
    ):
        """Uploads a local dataset (images ONLY) in the datasets directory.

        Parameters
        ----------
        files_to_upload : list[str]
            List of files or folders to upload; folders are scanned recursively.
        blocking : bool
            If False, the dataset is not uploaded and a generator function is returned instead
        multi_threaded : bool
            Uses multiprocessing to upload the dataset in parallel.
            If blocking is False this has no effect.
        files_to_exclude : list[str]
            List of files to exclude from the file scan
        fps : int
            Frame rate (frames per second) at which to sample video files when uploading them
        as_frames : bool
            Whether to upload videos as sequences of individual frames rather than as video files
        resume : bool
            Flag signalling that a previously interrupted push should be resumed
        path : str
            Optional remote path to upload the files into

        Returns
        -------
        generator : function
            Generator for doing the actual uploads. This is None if blocking is True
        count : int
            The number of files to upload
        """

        # Paths need to start with /
        if path and path[0] != "/":
            path = f"/{path}"

        # This is where the responses from the upload function will be saved/loaded for resuming
        self.local_path.parent.mkdir(exist_ok=True)
        responses_path = self.local_path.parent / ".upload_responses.json"
        # Init optional parameters
        if files_to_exclude is None:
            files_to_exclude = []
        if files_to_upload is None:
            raise NotFound("Dataset location not found. Check your path.")

        if resume:
            if not responses_path.exists():
                raise NotFound("Cannot resume push: no previous upload responses were found.")
            with responses_path.open() as f:
                logged_responses = json.load(f)
            files_to_exclude.extend([
                response["file_path"] for response in logged_responses
                if response["s3_response_status_code"].startswith("2")
            ])

        files_to_upload = find_files(files=files_to_upload,
                                     recursive=True,
                                     files_to_exclude=files_to_exclude)

        if not files_to_upload:
            raise ValueError(
                "No files to upload, check your path, exclusion filters and resume flag"
            )

        progress, count = add_files_to_dataset(
            client=self.client,
            dataset_id=str(self.dataset_id),
            filenames=files_to_upload,
            fps=fps,
            as_frames=as_frames,
            team=self.team,
            path=path,
        )

        # If blocking is selected, upload the dataset remotely
        if blocking:
            responses = exhaust_generator(progress=progress,
                                          count=count,
                                          multi_threaded=multi_threaded)
            # Log responses to file
            if responses:
                responses = [
                    {k: str(v) for k, v in response.items()}
                    for response in responses
                ]
                if resume:
                    responses.extend(logged_responses)
                with responses_path.open("w") as f:
                    json.dump(responses, f)
            return None, count
        else:
            return progress, count
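
A minimal usage sketch for push: it assumes a darwin-py Client authenticated through the local configuration, and the team/dataset slug, paths and exclusion list below are hypothetical placeholders.

from darwin.client import Client

# Authenticate from the local darwin-py configuration and fetch the remote dataset
# ("my-team/my-dataset" and "/data/images" are hypothetical placeholders).
client = Client.local()
dataset = client.get_remote_dataset("my-team/my-dataset")

# Blocking upload of a local folder of images, skipping OS metadata files
_, count = dataset.push(
    files_to_upload=["/data/images"],
    blocking=True,
    files_to_exclude=[".DS_Store"],
)
print(f"Queued {count} files for upload")

With blocking=False, push instead returns the upload generator together with the file count, so the caller can drive the uploads itself (e.g. with exhaust_generator) and handle the responses.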
Example #2
def upload_annotations(
    client: "Client",
    team: str,
    image_mapping: Path,
    annotations_path: Path,
    class_mapping: Path,
    dataset_id: Optional[int] = None,
    multi_threaded: bool = True,
):
    """Experimental feature to upload annotations from the front end

    Parameters
    ----------
    client: Client
        Client authenticated to the team where the put request will be made
    team: str
        Team against which the client will make the requests
    image_mapping: Path
        Path to the json file which contains the mapping between `original file name`
        and `dataset image id`, which is required in the put request to compose the endpoint
    annotations_path: Path
        Path to the folder which contains all the json files representing the annotations to add
    class_mapping: Path
        Path to the json file which contains the mapping between `class name` and `class id` which
        is required in the put request to compose the payload. If not provided, new classes
        will be created
    dataset_id: int
        Dataset ID where to upload the annotations. This is required if class_mapping is None
        or if a class present in the annotations is missing on Darwin
    multi_threaded : bool
        Uses multiprocessing to upload the dataset in parallel.
    Notes
    -----
        This function is experimental and the json files `image_mapping` and `class_mapping` can
        actually only be retrieved from the backend at the moment.
    """
    # This is where the responses from the upload function will be saved/loaded for resuming
    responses_path = image_mapping.parent / "upload_responses.json"
    output_file_path = image_mapping.parent / "log_requests.csv"

    # Read and prepare the image id mappings in a dict format {'original filename': 'image id'}
    with image_mapping.open() as json_file:
        image_mapping = {
            cm["original_filename"]: cm["id"]
            for cm in json.load(json_file)
        }

    # Read and prepare the class mappings in a dict format {'class name': 'class id'}
    if class_mapping is not None:
        with class_mapping.open() as json_file:
            class_mapping = {
                cm["name"]: cm["id"]
                for cm in json.load(json_file)
            }
    else:
        class_mapping = {}

    # Dataset image IDs that should be skipped (e.g. when resuming a previous upload)
    images_id = set()

    # Check that all the classes exist, creating missing ones on Darwin when possible
    for f in annotations_path.glob("*.json"):
        with f.open() as json_file:
            # Read the annotation json file
            data = json.load(json_file)
            image_dataset_id = image_mapping[data["image"]["original_filename"]]

            # Skip if already present
            if image_dataset_id in images_id:
                continue
            for annotation in data["annotations"]:
                # If the class is missing, create a new class on Darwin and update the mapping
                if annotation["name"] not in class_mapping:
                    if dataset_id is not None:
                        new_class = create_new_class(
                            client=client,
                            team=team,
                            # TODO: maybe allow polygons and bounding boxes as well in the future
                            annotation_type_ids=["3"],
                            cropped_image={
                                "image_id": image_dataset_id,
                                "scale": 0.01,
                                "x": "0",
                                "y": "0"
                            },
                            dataset_id=dataset_id,
                            description="",
                            expected_occurrences=[0, 1],
                            metadata=None,
                            name=annotation["name"],
                        )
                        class_mapping[new_class["name"]] = new_class["id"]
                    else:
                        raise ValueError(
                            "Dataset ID is None and a class is missing on Darwin"
                            " (or in the provided mapping).")

    # For each annotation found in the folder send out a request
    files_to_upload = []
    for f in annotations_path.glob("*.json"):
        with f.open() as json_file:
            # Read the annotation json file
            data = json.load(json_file)
            image_dataset_id = image_mapping[data["image"]["original_filename"]]
            # Skip if already present
            if image_dataset_id in images_id:
                continue
            files_to_upload.append({
                "data": data,
                "image_dataset_id": image_dataset_id
            })

    generator = (functools.partial(
        _upload_annotation,
        class_mapping=class_mapping,
        client=client,
        team=team,
        data=element["data"],
        image_dataset_id=element["image_dataset_id"],
        output_file_path=output_file_path,
    ) for element in files_to_upload)

    responses = exhaust_generator(progress=generator,
                                  count=len(files_to_upload),
                                  multi_threaded=multi_threaded)
    # Log responses to file
    if responses:
        components_labels = ["payload", "response"]
        # Build one log entry per response, labelling its payload and response components
        responses = [
            {
                component_label: {k: str(v) for k, v in component.items()}
                for component, component_label in zip(response, components_labels)
            }
            for response in responses
        ]
        with responses_path.open("w") as f:
            json.dump(responses, f)
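
A minimal usage sketch for upload_annotations (the function defined above): the mapping files are assumed to have been exported from the backend beforehand, as the Notes section describes, and all paths, the team slug and the dataset id are hypothetical.

from pathlib import Path
from darwin.client import Client

# Authenticate from the local darwin-py configuration ("my-team" is a hypothetical slug)
client = Client.local()

upload_annotations(
    client=client,
    team="my-team",
    image_mapping=Path("/exports/image_mapping.json"),
    annotations_path=Path("/exports/annotations"),
    class_mapping=Path("/exports/class_mapping.json"),
    dataset_id=123,  # required if a class used in the annotations is missing on Darwin
    multi_threaded=True,
)

The upload responses are written next to image_mapping as upload_responses.json, and a per-request log path (log_requests.csv in the same folder) is passed to each upload call.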
Example #3
    def pull(
        self,
        *,
        release: Optional[Release] = None,
        blocking: bool = True,
        multi_threaded: bool = True,
        only_annotations: bool = False,
        force_replace: bool = False,
        remove_extra: bool = False,
        subset_filter_annotations_function: Optional[Callable] = None,
        subset_folder_name: Optional[str] = None,
        use_folders: bool = False,
        video_frames: Optional[bool] = False,
    ):
        """Downloads a remote project (images and annotations) in the datasets directory.

        Parameters
        ----------
        release: Release
            The release to pull
        blocking : bool
            If False, the dataset is not downloaded and a generator function is returned instead
        multi_threaded : bool
            Uses multiprocessing to download the dataset in parallel. If blocking is False this has no effect.
        only_annotations: bool
            Download only the annotations and no corresponding images
        force_replace: bool
            Forces the re-download of an existing image
        remove_extra: bool
            Removes existing images for which there is no corresponding annotation
        subset_filter_annotations_function: Callable
            This function receives the directory where the annotations are downloaded and can
            perform any operation on them, e.g. filtering them with custom rules.
            If it needs additional parameters, it is advised to wrap it with functools.partial().
        subset_folder_name: str
            Name of the folder with the subset of the dataset. If not provided a timestamp is used.
        use_folders: bool
            Recreates folders from the dataset
        video_frames: bool
            Pulls video frame images instead of video files

        Returns
        -------
        generator : function
            Generator for doing the actual downloads. This is None if blocking is True
        count : int
            The number of files to download
        """
        if release is None:
            release = self.get_release()

        if release.format != "json":
            raise UnsupportedExportFormat(release.format)

        release_dir = self.local_releases_path / release.name
        release_dir.mkdir(parents=True, exist_ok=True)

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_dir = Path(tmp_dir)
            # Download the release from Darwin
            zip_file_path = release.download_zip(tmp_dir / "dataset.zip")
            with zipfile.ZipFile(zip_file_path) as z:
                # Extract annotations
                z.extractall(tmp_dir)
                # If a filtering function is provided, apply it
                if subset_filter_annotations_function is not None:
                    subset_filter_annotations_function(tmp_dir)
                    if subset_folder_name is None:
                        # Use a filesystem-safe timestamp as the default subset folder name
                        subset_folder_name = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
                annotations_dir = release_dir / (subset_folder_name or "") / "annotations"
                # Remove existing annotations if necessary
                if annotations_dir.exists():
                    try:
                        shutil.rmtree(annotations_dir)
                    except PermissionError:
                        print(
                            f"Could not remove dataset in {annotations_dir}. Permission denied."
                        )
                annotations_dir.mkdir(parents=True, exist_ok=False)
                # Move the annotations into the right folder and rename them to have the image
                # original filename as contained in the json
                for annotation_path in tmp_dir.glob("*.json"):
                    with annotation_path.open() as file:
                        annotation = json.load(file)
                    filename = Path(annotation["image"]["filename"]).stem
                    destination_name = annotations_dir / f"{filename}{annotation_path.suffix}"
                    shutil.move(str(annotation_path), str(destination_name))

        # Extract the list of classes and create the text files
        make_class_lists(release_dir)

        if release.latest:
            latest_dir = self.local_releases_path / "latest"
            if latest_dir.is_symlink():
                latest_dir.unlink()
            latest_dir.symlink_to(f"./{release_dir.name}")

        if only_annotations:
            # No images will be downloaded
            return None, 0

        team_config = self.client.config.get_team(self.team)
        api_key = team_config.get("api_key")

        # Create the generator with the download instructions
        progress, count = download_all_images_from_annotations(
            api_key=api_key,
            api_url=self.client.url,
            annotations_path=annotations_dir,
            images_path=self.local_images_path,
            force_replace=force_replace,
            remove_extra=remove_extra,
            use_folders=use_folders,
            video_frames=video_frames,
        )
        if count == 0:
            return None, count

        # If blocking is selected, download the dataset on the file system
        if blocking:
            exhaust_generator(progress=progress(),
                              count=count,
                              multi_threaded=multi_threaded)
            return None, count
        else:
            return progress, count
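
A minimal usage sketch for pull, assuming the same kind of authenticated Client as above; the team/dataset slug is a hypothetical placeholder.

from darwin.client import Client

# Authenticate and fetch the remote dataset (the slug is a hypothetical placeholder)
client = Client.local()
dataset = client.get_remote_dataset("my-team/my-dataset")

# Blocking download of the latest release: annotations first, then the images
release = dataset.get_release()
_, count = dataset.pull(release=release, video_frames=False)
print(f"Downloaded {count} files")

Passing only_annotations=True skips the image download entirely, and blocking=False returns the download generator and the file count instead of performing the downloads.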