示例#1
0
    def __init__(self, data_dir=None, **kwargs):
        """
        Set up the search facade, restoring a saved configuration if any.

        Parameters
        ----------
        data_dir (optional): str
            Directory holding the index configuration and image database.
            When falsy, the value returned by
            ``FileIO.pupyl_temp_data_dir()`` is used instead.

        import_images (optional, default: True): bool
            Forwarded to ``ImageDatabase``. Only honoured when no saved
            configuration could be read.

        characteristic (optional): Characteristics
            Only honoured when no saved configuration could be read.
            Defaults to ``Characteristics.HEAVYWEIGHT_HUGE_PRECISION``.
        """
        if data_dir:
            self._data_dir = data_dir
        else:
            self._data_dir = FileIO.pupyl_temp_data_dir()

        self._index_config_path = os.path.join(self._data_dir, 'index.json')

        configurations = self._index_configuration('r')

        if configurations:
            # A stored configuration takes precedence over keyword arguments.
            self._import_images = configurations['import_images']
            self._characteristic = Characteristics.by_name(
                configurations['characteristic'])

            if configurations.get('feature_size'):
                self._feature_size = configurations['feature_size']
        else:
            import_images = kwargs.get('import_images')
            characteristic = kwargs.get('characteristic')

            # Fall back to the default only when the argument is absent; a
            # truthiness test here would silently turn an explicit
            # ``import_images=False`` into ``True``.
            if import_images is None:
                self._import_images = True
            else:
                self._import_images = import_images

            if characteristic:
                self._characteristic = characteristic
            else:
                self._characteristic = Characteristics.\
                    HEAVYWEIGHT_HUGE_PRECISION

        self.image_database = ImageDatabase(import_images=self._import_images,
                                            data_dir=self._data_dir)
示例#2
0
    def __init__(self, size, data_dir=None, trees=.001, volatile=False):
        """
        Indexing tensors operations and nearest neighbours search.

        Parameters
        ----------
        size: int
            Shape of unidimensional vectors which will be indexed

        data_dir: str
            Location where to load or save the index. Mandatory for a
            permanent index, forbidden for a volatile one.

        trees (optional): float
            Defines the number of trees to create based on the dataset
            size. Should be a number between 0 and 1.

        volatile (optional): bool
            If the index will be temporary or not.

        Raises
        ------
        OSError
            When ``data_dir`` points to an existing regular file.

        NoDataDirForPermanentIndex
            When the index is permanent and no ``data_dir`` was given.

        DataDirDefinedForVolatileIndex
            When the index is volatile and a ``data_dir`` was given.

        FileIsNotAnIndex
            When loading an already existing index file raises ``OSError``.
        """
        self._position = -1
        self._size = size
        self._data_dir = data_dir
        self._trees = trees
        self._volatile = volatile

        if self._volatile:
            if self._data_dir:
                raise DataDirDefinedForVolatileIndex

            # A volatile index lives in a temporary file; its parent
            # directory becomes the data directory.
            temp_file = FileIO.safe_temp_file()
            self._data_dir = os.path.dirname(temp_file)
            self._path = temp_file
        else:
            if not self._data_dir:
                raise NoDataDirForPermanentIndex

            if os.path.isfile(self._data_dir):
                raise OSError('data_dir parameter is not a directory')

            os.makedirs(self._data_dir, exist_ok=True)
            self._path = os.path.join(self._data_dir, self.index_name)

        if os.path.isfile(self._path):
            # An index file is already there: load it instead of starting
            # from scratch.
            try:
                self.tree = AnnoyIndex(size, metric='angular')

                self.tree.load(self._path)
            except OSError as load_error:
                raise FileIsNotAnIndex from load_error
            else:
                self._is_new_index = False
        else:
            self.tree = AnnoyIndex(size, metric='angular')
            self._is_new_index = True

        self._image_database = ImageDatabase(
            import_images=True,
            data_dir=self._data_dir,
        )
示例#3
0
    def test_load_image_metadata_not_found(self):
        """Unit test for method load_image_metadata, index not found case."""
        # NOTE(review): sibling tests pass ``data_dir=``; confirm that
        # ``directory=`` is still an accepted keyword of ImageDatabase.
        image_database = ImageDatabase(
            import_images=True,
            directory=TEST_DIRECTORY
        )

        # Only the lookup is expected to fail, so it alone sits inside the
        # context manager: an IndexError raised while building the database
        # must not be mistaken for a passing test.
        with self.assertRaises(IndexError):
            image_database.load_image_metadata(999)
示例#4
0
def test_image_size_property():
    """Check that image_size returns the value previously assigned to it."""
    expected_size = (320, 200)

    database = ImageDatabase(
        data_dir=TEST_DIRECTORY,
        import_images=True
    )
    database.image_size = expected_size

    assert database.image_size == expected_size
示例#5
0
def test_bucket_size_property():
    """Check that bucket_size returns the value previously assigned to it."""
    expected_bucket_size = 999

    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )
    database.bucket_size = expected_bucket_size

    assert database.bucket_size == expected_bucket_size
示例#6
0
def test_insert_no_import_images():
    """Check insert with import_images off: metadata only, no image copy."""
    database = ImageDatabase(
        data_dir=TEST_TEMP_DIRECTORY,
        import_images=False
    )
    database.insert(20, 'tests/test_image.jpg')

    image_path = f'{TEST_TEMP_DIRECTORY}/0/20.jpg'
    metadata_path = f'{TEST_TEMP_DIRECTORY}/0/20.json'

    # Separate assertions so a failure tells which file is at fault.
    assert not exists(image_path)
    assert exists(metadata_path)
示例#7
0
def test_insert_import_images():
    """Check insert with import_images on: both image and metadata exist."""
    database = ImageDatabase(
        data_dir=TEST_TEMP_DIRECTORY,
        import_images=True
    )
    database.insert(10, 'tests/test_image.jpg')

    image_path = f'{TEST_TEMP_DIRECTORY}/0/10.jpg'
    metadata_path = f'{TEST_TEMP_DIRECTORY}/0/10.json'

    # Separate assertions so a failure tells which file is missing.
    assert exists(image_path)
    assert exists(metadata_path)
示例#8
0
def test_what_bucket():
    """Check that what_bucket floor-divides the index by the bucket size."""
    bucket_size = 10 ** 4

    database = ImageDatabase(
        import_images=True,
        bucket_size=bucket_size
    )

    resolved_bucket = database.what_bucket(TEST_INDEX)

    assert resolved_bucket == TEST_INDEX // bucket_size
示例#9
0
def test_list_images():
    """Check that the first path yielded by list_images is the expected."""
    expected_path = abspath('tests/test_database/0/0.jpg')

    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    listed_paths = list(database.list_images())

    assert expected_path == listed_paths[0]
示例#10
0
def test_list_images_less_than_count():
    """Check that list_images yields nothing when top is negative."""
    negative_top = -1

    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    listed_paths = list(database.list_images(top=negative_top))

    assert not listed_paths
示例#11
0
def test_load_image_metadata_filtered():
    """Check that filtered metadata only holds keys named in the filter."""
    allowed_keys = ('id', 'original_file_size')

    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    metadata = database.load_image_metadata(0, filtered=allowed_keys)

    # Collect offenders so a failure reports every unexpected key at once.
    unexpected_keys = [key for key in metadata if key not in allowed_keys]

    assert not unexpected_keys
示例#12
0
def test_import_images_property():
    """Check that import_images can be switched on and then off again."""
    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    database.import_images = True
    assert database.import_images

    database.import_images = False
    assert not database.import_images
示例#13
0
def test_mount_file_name():
    """Unit test for method mount_file_name."""
    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    # Expected layout: <data dir>/<bucket>/<index>.<extension>
    bucket_name = str(database.what_bucket(TEST_INDEX))
    expected_path = join(TEST_DIRECTORY, bucket_name, f'{TEST_INDEX}.json')

    assert database.mount_file_name(TEST_INDEX, 'json') == expected_path
示例#14
0
def test_save_image_metadata():
    """Check that metadata saved for an image can be read back unchanged."""
    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )
    database.save_image_metadata(0, TEST_IMAGE)

    stored_metadata = database.load_image_metadata(0)

    # The access time is dropped and the original path made relative
    # before comparing against the fixture.
    del stored_metadata['original_access_time']
    stored_metadata['original_path'] = relpath(
        stored_metadata['original_path']
    )

    assert stored_metadata == TEST_METADATA
示例#15
0
def test___len__():
    """Check that len() on the test database reports a single element."""
    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    element_count = len(database)

    assert element_count == 1
示例#16
0
def test_image_size_property_no_import_images():
    """Check the image_size reported when import_images is switched off."""
    expected_size = (800, 600)

    database = ImageDatabase(
        data_dir=TEST_DIRECTORY,
        import_images=False
    )

    assert database.image_size == expected_size
示例#17
0
def test_load_image():
    """Check that load_image matches get_image on the mounted jpg path."""
    image_index = 0

    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    loaded_image = database.load_image(image_index)

    # Reference value: fetch the same image through its mounted file name.
    mounted_path = database.mount_file_name(image_index, 'jpg')
    reference_image = database.get_image(mounted_path)

    assert loaded_image == reference_image
示例#18
0
    def test___get_item___not_found(self):
        """Unit test for method __get_item__, index not found case."""
        image_database = ImageDatabase(
            import_images=True,
            data_dir=TEST_DIRECTORY
        )

        # Only the subscript is expected to fail, so it alone sits inside
        # the context manager: an IndexError raised while building the
        # database must not be mistaken for a passing test.
        with self.assertRaises(IndexError):
            _ = image_database[999]
示例#19
0
def test_list_images_return_index():
    """Check the first (index, path) pair yielded with return_index set."""
    expected_pair = (0, abspath('tests/test_database/0/0.jpg'))

    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    listed_pairs = list(database.list_images(return_index=True))

    assert listed_pairs[0] == expected_pair
示例#20
0
def test_load_image_metadata():
    """Check the metadata loaded for indexes 0 and 1 against the fixtures."""
    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    expectations = ((0, TEST_METADATA), (1, TEST_METADATA_HTTP))

    for image_index, expected_metadata in expectations:
        metadata = database.load_image_metadata(image_index)

        # The access time is dropped before comparing against the fixture.
        del metadata['original_access_time']

        metadata = resolve_original_path(metadata)

        assert metadata == expected_metadata
示例#21
0
def test_image_database_definition():
    """Check that constructor arguments are reflected on the instance."""
    expected_import_images = True
    expected_bucket_size = 100
    expected_image_size = (640, 480)

    database = ImageDatabase(
        import_images=expected_import_images,
        data_dir=TEST_TEMP_DIRECTORY,
        bucket_size=expected_bucket_size,
        image_size=expected_image_size
    )

    assert isinstance(database, ImageDatabase)
    assert database.import_images == expected_import_images
    assert database._data_dir == TEST_TEMP_DIRECTORY
    assert database.bucket_size == expected_bucket_size
    assert database.image_size == expected_image_size
示例#22
0
def test___get_item__():
    """Check subscript access for indexes 0 and 1 against the fixtures."""
    database = ImageDatabase(
        import_images=True,
        data_dir=TEST_DIRECTORY
    )

    expectations = ((0, TEST_METADATA), (1, TEST_METADATA_HTTP))

    for image_index, expected_metadata in expectations:
        metadata = database[image_index]

        # The access time is dropped before comparing against the fixture.
        del metadata['original_access_time']

        metadata = resolve_original_path(metadata)

        assert metadata == expected_metadata
示例#23
0
class Index:
    """Procedures over multidimensional spaces."""

    def __init__(self, size, data_dir=None, trees=.001, volatile=False):
        """
        Indexing tensors operations and nearest neighbours search.

        Parameters
        ----------
        size: int
            Shape of unidimensional vectors which will be indexed

        data_dir: str
            Location where to load or save the index. Mandatory for a
            permanent index, forbidden for a volatile one.

        trees (optional): float
            Defines the number of trees to create based on the dataset
            size. Should be a number between 0 and 1.

        volatile (optional): bool
            If the index will be temporary or not.

        Raises
        ------
        OSError
            When ``data_dir`` points to an existing regular file.

        NoDataDirForPermanentIndex
            When the index is permanent and no ``data_dir`` was given.

        DataDirDefinedForVolatileIndex
            When the index is volatile and a ``data_dir`` was given.

        FileIsNotAnIndex
            When loading an already existing index file raises ``OSError``.
        """
        self._position = -1
        self._size = size
        self._data_dir = data_dir
        self._trees = trees
        self._volatile = volatile

        if self._data_dir and not self._volatile:
            if os.path.isfile(self._data_dir):
                raise OSError('data_dir parameter is not a directory')

            os.makedirs(self._data_dir, exist_ok=True)
            self._path = os.path.join(self._data_dir, self.index_name)
        elif not self._data_dir and not self._volatile:
            raise NoDataDirForPermanentIndex
        elif not self._data_dir and self._volatile:
            # A volatile index lives in a temporary file; its parent
            # directory becomes the data directory.
            _temp_file = FileIO.safe_temp_file()
            self._data_dir = os.path.dirname(_temp_file)
            self._path = _temp_file

        else:
            raise DataDirDefinedForVolatileIndex

        if os.path.isfile(self._path):
            # An index file is already there: load it instead of starting
            # from scratch.
            try:
                self.tree = AnnoyIndex(size, metric='angular')

                self.tree.load(self._path)

                self._is_new_index = False
            except OSError as os_error:
                raise FileIsNotAnIndex from os_error
        else:
            self.tree = AnnoyIndex(size, metric='angular')
            self._is_new_index = True

        self._image_database = ImageDatabase(
            import_images=True,
            data_dir=self._data_dir,
        )

    @property
    def size(self):
        """Getter for property size."""
        return self._size

    @property
    def path(self):
        """Getter for property path."""
        return self._path

    @property
    def index_name(self):
        """Getter for property index_name."""
        return 'pupyl.index'

    @property
    def trees(self):
        """Getter for property trees."""
        return self._trees

    @property
    def volatile(self):
        """Getter for property volatile."""
        return self._volatile

    @trees.setter
    def trees(self, trees):
        """Setter for property trees."""
        self._trees = trees

    def __enter__(self):
        """Context opening index."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Context closing index.

        When the block ended without an exception, a new index is built and
        saved to ``path``, and the tree is unloaded. Nothing is done when an
        exception is propagating.
        """
        if not exc_type:

            if self._is_new_index:
                self.tree.build(self.size << intmul >> self.trees)

                self.tree.save(self.path)

            self.tree.unload()

    def items(self):
        """Return the indexed items."""
        yield from range(len(self))

    def values(self):
        """Return the indexed values."""
        for item in self.items():
            yield self.tree.get_item_vector(item)

    def items_values(self):
        """Return tuples with all items and values."""
        yield from zip(self.items(), self.values())

    def __getitem__(self, position):
        """Return item at index. Supports negative slicing."""
        if position >= 0:
            return self.tree.get_item_vector(position)

        return self.tree.get_item_vector(
            len(self) - abs(position)
        )

    def refresh(self):
        """Update all information regarding index file."""
        self.tree.unload()
        self.tree.load(self.path)

    def append(self, tensor, check_unique=False):
        """
        Insert a new tensor at the end of the index.
        Be advised that, on an index loaded from disk, every stored value
        is copied into a temporary index, so the operation is linear on
        index size ($O(n)$).

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to insert into index.

        check_unique (optional, default: False): bool
            Defines if append method should verify the existence
            of a really similar tensor on the current index. In other words,
            it checks for the unicity of the value. Be advised that this check
            creates an overhead on the append process.

        Raises
        ------
        NullTensorError
            When the elements of ``tensor`` sum to zero.
        """
        # NOTE(review): a non-null tensor whose elements cancel out (for
        # instance [1, -1]) is rejected too - confirm this is intended.
        if sum(tensor) == 0.:
            raise NullTensorError

        if self._is_new_index:

            index_it = True

            if check_unique and len(self) > 1:

                # The tree is built only for the duplicate lookup and
                # unbuilt right after, so items can still be added.
                self.tree.build(self.size << intmul >> self.trees)

                result = self.item(
                    self.index(tensor),
                    top=1,
                    distances=True
                )

                if result[1][0] <= .05:
                    warning(
                        'Tensor being indexed already exists in '
                        'the database and the check for duplicates '
                        'are on. Refusing to store again this tensor.'
                    )

                    index_it = False

                self.tree.unbuild()

            if index_it:
                self.tree.add_item(len(self), tensor)

        else:

            # An index loaded from disk is rebuilt: every stored value plus
            # the new tensor go into a temporary index, which then replaces
            # the index file.
            with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
                for value in self.values():
                    tmp_idx.append(value, check_unique)

                tmp_idx.append(tensor, check_unique)

                _temp_file = tmp_idx.path

            move(_temp_file, self.path)

            self.refresh()

    def remove(self, position):
        """
        Remove the tensor at index from the database.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position: int
            The index which must be removed. Negative values count from
            the end, like in ``__getitem__``.

        Raises
        ------
        IndexNotBuildYet
            When the index was not loaded from an index file.

        IndexError
            When ``position`` is outside the indexed items.
        """
        if self._is_new_index:
            raise IndexNotBuildYet

        total = len(self)

        if position < 0:
            position += total

        # ``position == total`` is out of range as well: letting it through
        # would rewrite the whole index without removing anything.
        if not 0 <= position < total:
            raise IndexError

        with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
            next_item = 0

            # Copy every value but the removed one, renumbering on the fly
            # so the item ids stay contiguous.
            for item, value in self.items_values():
                if item == position:
                    continue

                tmp_idx.tree.add_item(next_item, value)
                next_item += 1

            _temp_file = tmp_idx.path

        move(_temp_file, self.path)

        self.refresh()

    def pop(self, position=None):
        """
        Pop-out the index at position, returning it.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position (optional) (default: last position): int
            Removes and returns the value at position. Negative values
            count from the end.

        Returns
        ----------
        The value stored at ``position``, as returned by ``__getitem__``.
        """
        # Compare against None: position 0 is a valid request and must not
        # be confused with "no position given".
        if position is None:
            position = len(self) - 1

        value = self[position]

        self.remove(position)

        return value

    def index(self, tensor):
        """
        Search for the first most similar image compared to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar.

        Returns
        ----------
        int:
            Describing the most similar resulting index.
        """
        return self.tree.get_nns_by_vector(tensor, n=1)[0]

    def item(self, position, top=10, distances=False):
        """
        Search the index using an internal position

        Parameters
        ----------
        position: int
            The item id within index.

        top (optional, default 10): int
            How many similar items should be returned.

        distances (optional, default False): bool
            If should be returned also the distances between
            items.

        Returns
        -------
        The result of ``tree.get_nns_by_item``, called with
        ``include_distances=distances``.
        """
        return self.tree.get_nns_by_item(
            position,
            top,
            include_distances=distances
        )

    def search(self, tensor, results=16):
        """
        Search for the first most similars image compared to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar images.

        results: int
            How many results to return. If similar images are less than
            results, it exhausts and will be returned actual total.
        """
        yield from self.tree.get_nns_by_vector(tensor, n=results)

    def __len__(self):
        """Return how many items are indexed."""
        return self.tree.get_n_items()

    def __iter__(self):
        """Return an iterable."""
        yield from self.values()

    def __next__(self):
        """Iterate over the iterable."""
        self._position += 1

        # Fetch only the requested vector rather than materialising every
        # value of the index on each call.
        if self._position < len(self):
            return self.tree.get_item_vector(self._position)

        raise StopIteration

    def group_by(self, top=10, **kwargs):
        """
        Returns all (or some position) on the index that is similar
        with other elements inside index.

        This is a generator: the exceptions below are raised when it is
        first iterated, not when it is called.

        Parameters
        ----------
        top (optional, default 10): int
            How many similar internal images should be returned

        position (optional): int
            Returns the groups based on a specified position.

        Yields
        ------
        list:
            If a position is defined, the items similar to it (nothing is
            yielded when there are none).

        or

        dict:
            One dictionary per indexed item, with the internal id as key
            and a list of similar images as value.

        Raises
        ------
        EmptyIndexError
            When the index holds one item or less.

        TopNegativeOrZero
            When ``top`` is lower than one.
        """
        position = kwargs.get('position')

        if len(self) <= 1:
            raise EmptyIndexError

        if not top >= 1:
            raise TopNegativeOrZero

        # One extra neighbour is requested and the first result dropped.
        # NOTE(review): presumably the first result is the queried item
        # itself - confirm.
        if isinstance(position, int):

            results = self.item(position, top + 1)

            if len(results) > 1:

                yield results[1:]

        else:

            for item in self.items():

                yield {
                    item: self.item(
                        item,
                        top + 1
                    )[1:]
                }

    def export_by_group_by(self, path, top=10, **kwargs):
        """
        Saves images, creating directories, based on their groups.

        Each group gets a directory named after its item, holding
        ``group.jpg`` and the similar images as ``1.jpg``, ``2.jpg``, ...
        Image files that cannot be found are skipped.

        Parameters
        ----------
        path: str
            Place to create the directories and export images

        top (optional, default 10):
            How many similar internal images should be returned

        position (optional): int
            Returns the groups based on a specified position.
        """
        for element in FileIO.progress(
            self.group_by(
                top=top,
                position=kwargs.get('position')
            )
        ):
            if isinstance(element, dict):
                item, similars = next(iter(element.items()))
            elif isinstance(element, list):
                item = kwargs['position']
                similars = element

            save_path = os.path.join(
                path,
                str(item)
            )

            os.makedirs(
                save_path,
                exist_ok=True
            )

            try:
                copyfile(
                    self._image_database.mount_file_name(
                        item,
                        'jpg'
                    ),
                    os.path.join(
                        save_path,
                        'group.jpg'
                    )
                )
            except FileNotFoundError:
                # Without the group image the whole group is skipped.
                continue

            for rank, similar in enumerate(similars):

                original_file_path = self._image_database.mount_file_name(
                    similar,
                    'jpg'
                )

                try:
                    copyfile(
                        original_file_path,
                        os.path.join(
                            save_path,
                            f'{rank + 1}.jpg'
                        )
                    )
                except FileNotFoundError:
                    continue
示例#24
0
class PupylImageSearch:
    """
    Encapsulates every aspect of pupyl, from feature extraction
    to indexing and image database.
    """
    def __init__(self, data_dir=None, **kwargs):
        """
        Set up the search facade, restoring a saved configuration if any.

        Parameters
        ----------
        data_dir (optional): str
            Directory holding the index configuration and image database.
            When falsy, the value returned by
            ``FileIO.pupyl_temp_data_dir()`` is used instead.

        import_images (optional, default: True): bool
            Forwarded to ``ImageDatabase``. Only honoured when no saved
            configuration could be read from ``data_dir``.

        characteristic (optional): Characteristics
            Only honoured when no saved configuration could be read from
            ``data_dir``. Defaults to
            ``Characteristics.HEAVYWEIGHT_HUGE_PRECISION``.
        """
        if data_dir:
            self._data_dir = data_dir
        else:
            self._data_dir = FileIO.pupyl_temp_data_dir()

        self._index_config_path = os.path.join(self._data_dir, 'index.json')

        configurations = self._index_configuration('r')

        if configurations:
            # A stored configuration takes precedence over keyword arguments.
            self._import_images = configurations['import_images']
            self._characteristic = Characteristics.by_name(
                configurations['characteristic'])

            if configurations.get('feature_size'):
                self._feature_size = configurations['feature_size']
        else:
            import_images = kwargs.get('import_images')
            characteristic = kwargs.get('characteristic')

            # Fall back to the default only when the argument is absent; a
            # truthiness test here would silently turn an explicit
            # ``import_images=False`` into ``True``.
            if import_images is None:
                self._import_images = True
            else:
                self._import_images = import_images

            if characteristic:
                self._characteristic = characteristic
            else:
                self._characteristic = Characteristics.\
                    HEAVYWEIGHT_HUGE_PRECISION

        self.image_database = ImageDatabase(import_images=self._import_images,
                                            data_dir=self._data_dir)

    def _index_configuration(self, mode, **kwargs):
        """
        Load or save an index configuration file, if exists.

        Parameters
        ----------
        mode (values: ('r', 'w')): str
            Defines which mode should be used over configuration
            file. 'r' is for file reading, 'w' for writing.

        feature_size(optional): int
            The size of current feature extraction method.

        Returns
        -------
        The parsed configuration when reading, ``True`` after writing, or
        ``False`` when opening the file raises ``FileNotFoundError``.
        """
        try:
            with open(self._index_config_path, mode) as config_file:
                if mode == 'r':

                    return json.load(config_file)

                if mode == 'w':
                    feature_size = kwargs.get('feature_size')

                    configurations = {
                        'import_images': self._import_images,
                        'characteristic': self._characteristic.name,
                    }

                    if feature_size:
                        configurations['feature_size'] = feature_size

                    json.dump(configurations, config_file)

                return True
        except FileNotFoundError:
            return False

    def index(self, uri, **kwargs):
        """
        Performs image indexing.

        Parameters
        ----------
        uri: str
            Directory or file, or http(s) location.

        **check_unique (optional): bool
            If, during the index process, imported images
            should have their unicity verified (to avoid duplicates).
        """
        # Resolved once up front: the value does not change per image.
        check_unique = kwargs.get('check_unique')

        if check_unique is None:
            check_unique = False

        with Extractors(
                characteristics=self._characteristic) as extractor, Index(
                    extractor.output_shape, data_dir=self._data_dir) as index:

            self._index_configuration('w', feature_size=extractor.output_shape)

            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(self.image_database.insert, rank,
                                    uri_from_file): rank
                    for rank, uri_from_file in enumerate(
                        extractor.scan_images(uri))
                }

                ranks = []

                # NOTE(review): the futures' results are never inspected, so
                # an exception raised by ``insert`` is not surfaced here -
                # confirm this is intended.
                for future in extractor.progress(
                        concurrent.futures.as_completed(futures),
                        message='Importing images:'):
                    ranks.append(futures[future])

                # Imports complete in arbitrary order; sorting makes the
                # tensors reach the index in rank order.
                for rank in extractor.progress(sorted(ranks),
                                               precise=True,
                                               message='Indexing images:'):
                    features_tensor_name = self.image_database.\
                        mount_file_name(
                            rank,
                            'npy'
                        )

                    extractor.save_tensor(
                        extractor.extract,
                        self.image_database.mount_file_name(rank, 'jpg'),
                        features_tensor_name)

                    index.append(extractor.load_tensor(features_tensor_name),
                                 check_unique=check_unique)

                    # The tensor file is only a hand-over between extractor
                    # and index; it is deleted once appended.
                    os.remove(features_tensor_name)

    def search(self, query, top=4):
        """
        Executes the search for a created database

        Parameters
        ----------
        query: str
            URI of a image to query

        top (optional)(default: 4): int
            How many results should be returned.
        """
        with Extractors(characteristics=self._characteristic) as extractor:
            with Index(extractor.output_shape,
                       data_dir=self._data_dir) as index:
                yield from index.search(extractor.extract(query),
                                        results=top)