예제 #1
0
    def featurize(self,
                  input_files,
                  labels=None,
                  weights=None,
                  in_memory=False):
        """Featurizes image files.

        Directories are traversed level by level and `.zip` archives are
        extracted into a fresh temporary directory. Every `.png`/`.tif`
        file found is collected in discovery order (no sorting is applied).

        Parameters
        ----------
        input_files: list
          Each file in this list should either be of a supported image format
          (.png, .tif only for now) or of a compressed folder of image files
          (only .zip for now). Directories are accepted as well. A value that
          is not a list is treated as a single entry.
        labels: optional (default None)
          Passed through unchanged as `y` of the returned dataset.
        weights: optional (default None)
          Passed through unchanged as `w` of the returned dataset.
        in_memory: bool
          If true, return in-memory NumpyDataset. Else return ImageDataset.

        Returns
        -------
        NumpyDataset or ImageDataset
          Dataset whose `ids` are the collected image file paths.

        Raises
        ------
        ValueError
          If an entry (or a file found while traversing a directory) is
          neither a directory, a `.zip` archive, nor a `.png`/`.tif` image.
          Non-image members of a `.zip` archive are skipped silently.
        """
        image_extensions = (".png", ".tif")

        if not isinstance(input_files, list):
            input_files = [input_files]

        image_files = []
        pending = input_files
        # Sometimes zip files contain directories within. Traverse directories
        while pending:
            remainder = []
            for input_file in pending:
                extension = os.path.splitext(input_file)[1].lower()
                # TODO(rbharath): Add support for more extensions
                if os.path.isdir(input_file):
                    # Queue the directory's children for the next pass.
                    remainder.extend(
                        os.path.join(input_file, subfile)
                        for subfile in os.listdir(input_file))
                elif extension == ".zip":
                    # Open the archive first so that an unreadable file does
                    # not leave an empty temporary directory behind; the
                    # context manager closes the handle even if extraction
                    # fails part-way.
                    with zipfile.ZipFile(input_file, 'r') as zip_ref:
                        zip_dir = tempfile.mkdtemp()
                        zip_ref.extractall(path=zip_dir)
                        member_names = zip_ref.namelist()
                    for name in member_names:
                        member_ext = os.path.splitext(name)[1].lower()
                        if member_ext in image_extensions:
                            image_files.append(os.path.join(zip_dir, name))
                elif extension in image_extensions:
                    image_files.append(input_file)
                else:
                    raise ValueError("Unsupported file format")
            pending = remainder

        if in_memory:
            return NumpyDataset(self.load_img(image_files),
                                y=labels,
                                w=weights,
                                ids=image_files)
        return ImageDataset(image_files,
                            y=labels,
                            w=weights,
                            ids=image_files)
예제 #2
0
    def create_dataset(self,
                       inputs: Union[OneOrMany[str], Tuple[Any]],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = 8192,
                       in_memory: bool = False) -> Dataset:
        """Creates and returns a `Dataset` object by featurizing provided image files and labels/weights.

        Parameters
        ----------
        inputs: `Union[OneOrMany[str], Tuple[Any]]`
          The inputs provided should be one of the following

            - filename
            - list of filenames
            - Tuple (list of filenames, labels)
            - Tuple (list of filenames, labels, weights)

          Each file in a given list of filenames should either be of a supported
          image format (.png, .tif only for now) or of a compressed folder of
          image files (only .zip for now). Directories are traversed as well.
          If `labels` or `weights` are provided, they must correspond to the
          sorted order of all filenames provided, with one label/weight per
          file.
        data_dir: str, optional (default None)
          Directory to store featurized dataset. Only used when `in_memory`
          is True.
        shard_size: int, optional (default 8192)
          Shard size when loading data. Only used when `in_memory` is True and
          `data_dir` is given; `None` skips resharding.
        in_memory: bool, optional (default False)
          If true, return in-memory NumpyDataset (or a DiskDataset written to
          `data_dir` when that is given). Else return ImageDataset.

        Returns
        -------
        Dataset
          A `Dataset` object containing a featurized representation of data
          from `input_files`, `labels`, and `weights`.

        Raises
        ------
        ValueError
          If `inputs` is a tuple whose length is not 1, 2, or 3, or if an
          entry (or a file found while traversing a directory) is neither a
          directory, a `.zip` archive, nor a `.png`/`.tif` image. Non-image
          members of a `.zip` archive are skipped silently.
        """
        image_extensions = (".png", ".tif")

        labels, weights = None, None
        if isinstance(inputs, tuple):
            if len(inputs) == 1:
                input_files = inputs[0]
            elif len(inputs) == 2:
                input_files, labels = inputs
            elif len(inputs) == 3:
                input_files, labels, weights = inputs
            else:
                raise ValueError("Input must be a tuple of length 1, 2, or 3")
        else:
            input_files = inputs
        # A bare filename (given directly or as the first tuple element) is
        # treated as a one-element list.
        if isinstance(input_files, str):
            input_files = [input_files]

        image_files = []
        # Copy so the caller's container is never aliased by the work queue.
        pending = list(input_files)
        # Sometimes zip files contain directories within. Traverse directories
        while pending:
            remainder = []
            for input_file in pending:
                extension = os.path.splitext(input_file)[1].lower()
                # TODO(rbharath): Add support for more extensions
                if os.path.isdir(input_file):
                    # Queue the directory's children for the next pass.
                    remainder.extend(
                        os.path.join(input_file, subfile)
                        for subfile in os.listdir(input_file))
                elif extension == ".zip":
                    # Open the archive first so that an unreadable file does
                    # not leave an empty temporary directory behind; the
                    # context manager closes the handle even if extraction
                    # fails part-way.
                    with zipfile.ZipFile(input_file, 'r') as zip_ref:
                        zip_dir = tempfile.mkdtemp()
                        zip_ref.extractall(path=zip_dir)
                        member_names = zip_ref.namelist()
                    for name in member_names:
                        member_ext = os.path.splitext(name)[1].lower()
                        if member_ext in image_extensions:
                            image_files.append(os.path.join(zip_dir, name))
                elif extension in image_extensions:
                    image_files.append(input_file)
                else:
                    raise ValueError("Unsupported file format")
            pending = remainder

        # Sort image files so labels/weights can be matched deterministically.
        image_files = sorted(image_files)

        if not in_memory:
            return ImageDataset(image_files,
                                y=labels,
                                w=weights,
                                ids=image_files)

        if data_dir is None:
            return NumpyDataset(load_image_files(image_files),
                                y=labels,
                                w=weights,
                                ids=image_files)

        dataset = DiskDataset.from_numpy(load_image_files(image_files),
                                         y=labels,
                                         w=weights,
                                         ids=image_files,
                                         tasks=self.tasks,
                                         data_dir=data_dir)
        if shard_size is not None:
            dataset.reshard(shard_size)
        return dataset