def __init__(self, root: Path, split: Path, transform: Optional[List] = None):
    """
    Creates a dataset

    Parameters
    ----------
    root : Path
        Path to the location of the dataset on the file system
    split : Path
        Path to the *.txt file containing the list of files for this split.
    transform : list[torchvision.transforms]
        List of PyTorch transforms. When a list is given it is wrapped in ``Compose``.

    Raises
    ------
    FileNotFoundError
        If the split file does not exist.
    ValueError
        If a stem listed in the split file has no image with an allowed extension
        under ``root / "images"``, or if the split file lists no files at all.
    """
    self.root = root
    self.split = split
    self.transform = transform
    self.images_path: List[Path] = []
    self.annotations_path: List[Path] = []
    self.classes = None
    self.original_classes = None
    self.original_images_path: Optional[List[Path]] = None
    self.original_annotations_path: Optional[List[Path]] = None
    self.convert_polygons: Optional[Callable] = None

    # Compose the transform if necessary
    if self.transform is not None and isinstance(self.transform, list):
        self.transform = Compose(transform)

    # Populate internal lists of annotations and images paths
    if not self.split.exists():
        raise FileNotFoundError(f"Could not find partition file: {self.split}")

    # Read every stem eagerly inside a context manager so that the split file
    # handle is closed deterministically instead of being left to the GC.
    with self.split.open() as split_file:
        stems = [line.strip() for line in split_file]

    # Map each image stem to its extension once, so the per-stem lookup is O(1)
    image_extensions_mapping = {
        image.stem: image.suffix
        for image in self.root.glob("images/*")
        if is_image_extension_allowed(image.suffix)
    }

    for stem in stems:
        annotation_path = self.root / f"annotations/{stem}.json"
        try:
            extension = image_extensions_mapping[stem]
        except KeyError:
            raise ValueError(
                f"Annotation ({annotation_path}) does not have a corresponding image"
            )
        image_path = self.root / f"images/{stem}{extension}"
        self.images_path.append(image_path)
        self.annotations_path.append(annotation_path)

    if not self.images_path:
        raise ValueError(
            f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file"
            f" in {self.root / 'images'}"
        )

    # Both lists are appended to in lockstep above; this is a sanity check only
    assert len(self.images_path) == len(self.annotations_path)
def sign_upload(client: "Client", image_id: int, key: str, file_path: Path, team: str):
    """Asks the backend for a signed URL so that the file can be uploaded to AWS without credentials

    Parameters
    ----------
    client: Client
        Client authenticated to the team where the put request will be made
    image_id: int
        Id of the image (or video) to upload
    key: str
        Path in the s3 bucket
    file_path: Path
        Path to the file to upload on the file system
    team: str
        Team forwarded to ``client.post``

    Returns
    -------
    dict
        Dictionary which contains the server response.
        ``None`` when the file extension is neither an allowed image
        extension nor an allowed video extension.
    """
    # NOTE(review): Path.suffix keeps the leading dot, so the content type sent
    # is e.g. "image/.png" - confirm this is what the backend expects.
    file_format = file_path.suffix

    if is_image_extension_allowed(file_format):
        endpoint = f"/dataset_images/{image_id}/sign_upload?key={key}"
        content_type = f"image/{file_format}"
    elif is_video_extension_allowed(file_format):
        endpoint = f"/dataset_videos/{image_id}/sign_upload?key={key}"
        content_type = f"video/{file_format}"
    else:
        # NOTE(review): unsupported extensions produce no request at all;
        # callers receive None - confirm this is intended.
        return None

    payload = {"filePath": str(file_path), "contentType": content_type}
    return client.post(endpoint=endpoint, payload=payload, team=team)
def _split_on_file_type(files: List[Path]):
    """Partitions a list of files into images and videos according to their extension

    Parameters
    ----------
    files : list[Path]
        List of files to split according to their type

    Returns
    -------
    images, videos : list[Path]
        List of image and videos, respectively

    Raises
    ------
    UnsupportedFileType
        On the first file whose extension is neither an allowed image
        extension nor an allowed video extension.
    """
    images: List[Path] = []
    videos: List[Path] = []

    for candidate in files:
        extension = candidate.suffix
        if is_image_extension_allowed(extension):
            images.append(candidate)
            continue
        if is_video_extension_allowed(extension):
            videos.append(candidate)
            continue
        # Neither an image nor a video: abort the whole split
        raise UnsupportedFileType(candidate)

    return images, videos
def download_all_images_from_annotations(
    api_key: str,
    api_url: str,
    annotations_path: Path,
    images_path: Path,
    force_replace: bool = False,
    remove_extra: bool = False,
    annotation_format: str = "json",
    use_folders: bool = False,
    video_frames: bool = False,
) -> Tuple[Callable[[], Iterator[Any]], int]:
    """Helper function: downloads the all images corresponding to a project.

    Nothing is downloaded by this function itself: it returns a callable that
    produces one deferred download (a ``functools.partial``) per annotation.

    Parameters
    ----------
    api_key : str
        API Key of the current team
    api_url : str
        Url of the darwin API (e.g. 'https://darwin.v7labs.com/api/')
    annotations_path : Path
        Path where the annotations are located
    images_path : Path
        Path where to download the images. Created (including missing parents)
        when it does not exist yet.
    force_replace: bool
        Forces the re-download of an existing image
    remove_extra: bool
        Removes existing images for which there is not corresponding annotation
    annotation_format : str
        Format of the annotations. Currently only JSON and xml are expected
    use_folders: bool
        Recreate folders
    video_frames: bool
        Pulls video frames images instead of video files

    Returns
    -------
    generator : function
        Generator for doing the actual downloads,
    count : int
        The files count

    Raises
    ------
    ValueError
        If ``annotation_format`` is neither "json" nor "xml".
    """
    # parents=True so that a nested destination which does not exist yet is
    # created as well instead of failing with FileNotFoundError.
    Path(images_path).mkdir(parents=True, exist_ok=True)
    if annotation_format not in ["json", "xml"]:
        raise ValueError(f"Annotation format {annotation_format} not supported")

    # Verify that there is not already image in the images folder
    unfiltered_files = images_path.rglob("*") if use_folders else images_path.glob("*")
    existing_images = {
        image.stem: image
        for image in unfiltered_files
        if is_image_extension_allowed(image.suffix)
    }

    annotations_to_download_path = []
    for annotation_path in annotations_path.glob(f"*.{annotation_format}"):
        # NOTE(review): the file is parsed with json.load even when
        # annotation_format is "xml" - confirm xml annotations are really supported.
        with annotation_path.open() as file:
            annotation = json.load(file)
        if not force_replace:
            # Check collisions on image filename, original_filename and json filename on the system.
            # The checks stay sequential so later keys are only read when earlier ones did not match.
            if sanitize_filename(Path(annotation["image"]["filename"]).stem) in existing_images:
                continue
            if sanitize_filename(Path(annotation["image"]["original_filename"]).stem) in existing_images:
                continue
            if sanitize_filename(annotation_path.stem) in existing_images:
                continue
        annotations_to_download_path.append(annotation_path)

    if remove_extra:
        # Removes existing images for which there is not corresponding annotation.
        # A set keeps the membership test O(1) per existing image.
        annotations_downloaded_stem = {
            a.stem for a in annotations_path.glob(f"*.{annotation_format}")
        }
        for existing_image in existing_images.values():
            if existing_image.stem not in annotations_downloaded_stem:
                print(f"Removing {existing_image} as there is no corresponding annotation")
                existing_image.unlink()

    # Create the generator with the partial functions
    count = len(annotations_to_download_path)

    def generator() -> Iterator[Any]:
        # Each call yields a fresh series of deferred downloads
        return (
            functools.partial(
                download_image_from_annotation,
                api_key,
                api_url,
                annotation_path,
                images_path,
                annotation_format,
                use_folders,
                video_frames,
            )
            for annotation_path in annotations_to_download_path
        )

    return generator, count
def it_returns_false_for_unknown_extensions():
    """An extension that is not an allowed image extension must be rejected."""
    unknown_extension = ".not_an_image"
    assert not is_image_extension_allowed(unknown_extension)
def it_returns_true_for_allowed_extensions():
    """The ``.png`` extension must be accepted as an image extension."""
    png_extension = ".png"
    assert is_image_extension_allowed(png_extension)
def get_annotations(
    dataset,
    partition: str,
    split: str = "split",
    split_type: str = "stratified",
    annotation_type: str = "polygon",
):
    """
    Returns all the annotations of a given dataset and split as a list of records

    Parameters
    ----------
    dataset
        Path to the location of the dataset on the file system
        (a ``Path`` or ``str``), or an object exposing ``local_path``
    partition
        Selects one of the partitions [train, val, test]
    split
        Selects the split that defines the percentages used (use 'split' to select the default split)
    split_type
        Heuristic used to do the split [random, stratified]
    annotation_type
        The type of annotation classes [tag, polygon]

    Returns
    -------
    list[dict]
        One record per image with the keys ``file_name``, ``height``, ``width``,
        ``image_id`` and ``annotations``. Each annotation holds ``bbox``,
        ``bbox_mode``, ``segmentation``, ``category_id`` and ``iscrowd``.

    Raises
    ------
    ValueError
        If ``partition``, ``split_type`` or ``annotation_type`` has an unsupported value,
        if a stem has no image or several images with allowed extensions,
        or if the split lists no image at all.
    """
    assert dataset is not None

    if isinstance(dataset, (Path, str)):
        dataset_path = Path(dataset)
    else:
        dataset_path = dataset.local_path

    if partition not in ["train", "val", "test"]:
        raise ValueError("partition should be either 'train', 'val', or 'test'")
    if split_type not in ["random", "stratified"]:
        raise ValueError("split_type should be either 'random' or 'stratified'")
    if annotation_type not in ["tag", "polygon"]:
        raise ValueError("annotation_type should be either 'tag' or 'polygon'")

    # Get the list of classes
    classes = get_classes(dataset, annotation_type=annotation_type, remove_background=True)

    # Get the split (split_type was validated above, so only two cases remain)
    if split_type == "random":
        split_file = f"{split_type}_{partition}.txt"
    else:
        split_file = f"{split_type}_{annotation_type}_{partition}.txt"
    split_path = dataset_path / "lists" / split / split_file

    # Read the stems eagerly inside a context manager so the file handle is closed
    with split_path.open() as split_handle:
        stems = [line.strip() for line in split_handle]

    images_path = []
    annotations_path = []

    # Find all the annotations and their corresponding images
    for stem in stems:
        annotation_path = dataset_path / f"annotations/{stem}.json"
        images = [
            image
            for image in dataset_path.glob(f"images/{stem}.*")
            if is_image_extension_allowed(image.suffix)
        ]
        if len(images) < 1:
            raise ValueError(
                f"Annotation ({annotation_path}) does" f" not have a corresponding image"
            )
        if len(images) > 1:
            raise ValueError(
                f"Image ({stem}) is present with multiple extensions." f" This is forbidden."
            )
        assert len(images) == 1
        images_path.append(images[0])
        annotations_path.append(annotation_path)

    if len(images_path) == 0:
        raise ValueError(
            f"Could not find any {SUPPORTED_IMAGE_EXTENSIONS} file"
            f" in {dataset_path / 'images'}"
        )

    assert len(images_path) == len(annotations_path)

    # detectron2 is optional: fall back to the raw integer 0 when it is not installed
    try:
        from detectron2.structures import BoxMode
    except ImportError:
        BoxMode = None
    box_mode = BoxMode.XYXY_ABS if BoxMode is not None else 0

    # Load and re-format all the annotations
    dataset_dicts = []
    for image_id, (im_path, annot_path) in enumerate(zip(images_path, annotations_path)):
        with annot_path.open() as f:
            data = json.load(f)

        record = {
            "file_name": str(im_path),
            "height": data["image"]["height"],
            "width": data["image"]["width"],
            "image_id": image_id,
        }

        objs = []
        for obj in data["annotations"]:
            # Only polygon annotations are converted; everything else is skipped
            if "polygon" not in obj:
                continue
            px, py = [], []
            for point in obj["polygon"]["path"]:
                px.append(point["x"])
                py.append(point["y"])
            if len(px) < 3:
                # Discard polygons with less than 3 points
                continue

            # Flatten [(x0, y0), (x1, y1), ...] into [x0, y0, x1, y1, ...]
            poly = list(itertools.chain.from_iterable(zip(px, py)))
            category_id = classes.index(obj["name"])

            objs.append(
                {
                    "bbox": [np.min(px), np.min(py), np.max(px), np.max(py)],
                    "bbox_mode": box_mode,
                    "segmentation": [poly],
                    "category_id": category_id,
                    "iscrowd": 0,
                }
            )
        record["annotations"] = objs
        dataset_dicts.append(record)
    return dataset_dicts