Пример #1
0
    def nn(self, h, n=1):
        """
        Find the ``n`` indexed hash codes closest to the query hash code.

        Closeness is measured by hamming distance.  Reported distances are
        that distance divided by the number of bits in the query, i.e. the
        fraction of the query's bits in which a neighbor differs.

        :param h: Query hash code bit-vector.  Should be the same bit length
            as the indexed hash codes.
        :type h: numpy.ndarray[bool] | list[bool]

        :param n: Maximum number of nearest neighbors to return.
        :type n: int

        :raises ValueError: No index to query from.

        :return: Two parallel lists: the neighbor hash codes as bit-vectors,
            ordered nearest first, and the normalized distance to each.
        :rtype: (list[numpy.ndarray[bool]], list[float])

        """
        super(LinearHashIndex, self).nn(h, n)

        query_int = bit_vector_to_int_large(h)
        n_bits = len(h)

        def dist_to_query(code):
            # Ranking key: hamming distance between the query and one
            # indexed integer hash code.
            return hamming_distance(query_int, code)

        #: :type: list[int|long]
        closest = heapq.nsmallest(n, self.index, key=dist_to_query)
        raw_dists = [hamming_distance(code, query_int) for code in closest]
        neighbors = [int_to_bit_vector_large(code, n_bits)
                     for code in closest]
        normalized = [dist / float(n_bits) for dist in raw_dists]
        return neighbors, normalized
Пример #2
0
    def nn(self, h, n=1):
        """
        Find the ``n`` indexed hash codes closest to the query hash code.

        Closeness is measured by hamming distance.  Reported distances are
        that distance divided by the number of bits in the query, i.e. the
        fraction of the query's bits in which a neighbor differs.

        :param h: Query hash code bit-vector.  Should be the same bit length
            as the indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Maximum number of nearest neighbors to return.
        :type n: int

        :raises ValueError: No index to query from.

        :return: Two parallel lists: the neighbor hash codes as bit-vectors,
            ordered nearest first, and the normalized distance to each.
        :rtype: (list[numpy.ndarray[bool]], list[float])

        """
        super(LinearHashIndex, self).nn(h, n)

        query_int = bit_vector_to_int_large(h)
        n_bits = len(h)

        def dist_to_query(code):
            # Ranking key: hamming distance between the query and one
            # indexed integer hash code.
            return hamming_distance(query_int, code)

        #: :type: list[int|long]
        closest = heapq.nsmallest(n, self.index, key=dist_to_query)
        neighbors = [int_to_bit_vector_large(code, n_bits)
                     for code in closest]
        normalized = [hamming_distance(code, query_int) / float(n_bits)
                      for code in closest]
        return neighbors, normalized
Пример #3
0
    def _remove_from_index(self, uids):
        """
        Drop the descriptors with the given UIDs from this index.

        Each UID is removed from the UID set stored under its descriptor's
        hash code in the hash-to-UID key-value store.  A hash whose UID set
        becomes empty is removed from that store and, when a hash index is
        set, from the hash index as well.  Finally the descriptors themselves
        are removed from the descriptor index.

        :param uids: Iterable of UIDs of descriptors to remove from this index.
        :type uids: collections.Iterable[collections.Hashable]

        :raises KeyError: One or more UIDs provided do not match any stored
            descriptors.  The index should not be modified.
        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            # Materialize so the UIDs can be traversed more than once.
            uids = list(uids)

            # Hash every descriptor up front, before touching any stored
            # state.  `get_many_descriptors` is expected to raise KeyError
            # for unknown UIDs, which aborts here with nothing modified.
            self._log.debug("Removing hash2uid entries for UID's descriptors")
            hashed = []
            for elem in self.descriptor_index.get_many_descriptors(uids):
                vec = self.lsh_functor.get_hash(elem.vector())
                hashed.append((bit_vector_to_int_large(vec), vec))

            # All UIDs resolved: strip each one from its hash's UID set.
            emptied_hashes = collections.deque()
            for uid, (code, vec) in zip(uids, hashed):
                # noinspection PyUnresolvedReferences
                remaining = self.hash2uuids_kvstore.get(code) - {uid}
                if not remaining:
                    # Nothing maps to this hash any more; drop the key and
                    # remember the bit-vector for the hash index.
                    emptied_hashes.append(vec)
                    self.hash2uuids_kvstore.remove(code)
                else:
                    # Other UIDs still share this hash; store what is left.
                    self.hash2uuids_kvstore.add(code, remaining)

            # Only touch the hash index when one is set and some hash lost
            # its last UID.
            if self.hash_index and emptied_hashes:
                self.hash_index.remove_from_index(emptied_hashes)

            # Lastly, forget the descriptors themselves.
            self.descriptor_index.remove_many_descriptors(uids)
Пример #4
0
    def _update_index(self, descriptors):
        """
        Additively update this index with the given descriptor elements.

        The descriptors are added to the descriptor index, a hash code is
        generated for each one, the hash-to-UID key-value store is updated in
        a single batch call, and the hash index (when one is set) is updated
        with the generated hash vectors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to add to this
            index.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            # The input may be a one-shot iterable but is needed twice: once
            # for the descriptor index and once for hashing.
            index_iter, hashing_iter = itertools.tee(descriptors, 2)

            self._log.debug("Updating descriptor index.")
            self.descriptor_index.add_many_descriptors(index_iter)

            self._log.debug("Generating hash codes for new descriptors")
            reporter = ProgressReporter(self._log.debug, 1.0).start()
            # Hash bit-vectors, later handed to the hash index.
            #: :type: collections.deque[numpy.ndarray[bool]]
            new_hash_vectors = collections.deque()
            # Integer hash code -> UID set, flushed to the kv-store after
            # the loop.
            pending = {}
            for elem in hashing_iter:
                vec = self.lsh_functor.get_hash(elem.vector())
                new_hash_vectors.append(vec)
                code = bit_vector_to_int_large(vec)
                if code in pending:
                    uid_set = pending[code]
                else:
                    # First sighting in this call: start from whatever the
                    # store currently holds for this hash.
                    #: :type: set
                    uid_set = self.hash2uuids_kvstore.get(code, set())
                uid_set |= {elem.uuid()}
                pending[code] = uid_set
                reporter.increment_report()
            reporter.report()

            self._log.debug("Updating kv-store with new hash codes")
            self.hash2uuids_kvstore.add_many(pending)
            del pending

            if self.hash_index is not None:
                self._log.debug("Updating hash index structure.")
                self.hash_index.update_index(new_hash_vectors)
Пример #5
0
    def _update_index(self, descriptors):
        """
        Additively update this index with the given descriptor elements.

        The descriptors are added to the descriptor index, a hash code is
        generated for each one, the hash-to-UID key-value store is updated in
        a single batch call, and the hash index (when one is set) is updated
        with the generated hash vectors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to add to this
            index.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            # The input may be a one-shot iterable but is needed twice: once
            # for the descriptor index and once for hashing.
            index_iter, hashing_iter = itertools.tee(descriptors, 2)

            self._log.debug("Updating descriptor index.")
            self.descriptor_index.add_many_descriptors(index_iter)

            self._log.debug("Generating hash codes for new descriptors")
            reporter = ProgressReporter(self._log.debug, 1.0).start()
            # Hash bit-vectors, later handed to the hash index.
            #: :type: collections.deque[numpy.ndarray[bool]]
            new_hash_vectors = collections.deque()
            # Integer hash code -> UID set, flushed to the kv-store after
            # the loop.
            pending = {}
            for elem in hashing_iter:
                vec = self.lsh_functor.get_hash(elem.vector())
                new_hash_vectors.append(vec)
                code = bit_vector_to_int_large(vec)
                if code in pending:
                    uid_set = pending[code]
                else:
                    # First sighting in this call: start from whatever the
                    # store currently holds for this hash.
                    #: :type: set
                    uid_set = self.hash2uuids_kvstore.get(code, set())
                uid_set |= {elem.uuid()}
                pending[code] = uid_set
                reporter.increment_report()
            reporter.report()

            self._log.debug("Updating kv-store with new hash codes")
            self.hash2uuids_kvstore.add_many(pending)
            del pending

            if self.hash_index is not None:
                self._log.debug("Updating hash index structure.")
                self.hash_index.update_index(new_hash_vectors)
Пример #6
0
    def _build_index(self, descriptors):
        """
        Rebuild this index from scratch over the given descriptor elements.

        The descriptor index and the hash-to-UID key-value store are cleared
        and repopulated.  When a hash index is set, it is rebuilt from the
        hash vectors generated for the new descriptors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError(
                    "Cannot modify container attributes due to "
                    "being in read-only mode.")

            self._log.debug("Clearing and adding new descriptor elements")
            self.descriptor_index.clear()
            self.descriptor_index.add_many_descriptors(descriptors)

            self._log.debug("Generating hash codes")
            self.hash2uuids_kvstore.clear()
            reporter = ProgressReporter(self._log.debug, 1.0).start()
            # Hash bit-vectors, later handed to the hash index.
            #: :type: collections.deque[numpy.ndarray[bool]]
            built_hash_vectors = collections.deque()
            for elem in self.descriptor_index:
                vec = self.lsh_functor.get_hash(elem.vector())
                built_hash_vectors.append(vec)

                # Read-modify-write of the UID set stored under this hash.
                code = bit_vector_to_int_large(vec)
                #: :type: set
                uid_set = self.hash2uuids_kvstore.get(code, set())
                uid_set.add(elem.uuid())
                self.hash2uuids_kvstore.add(code, uid_set)

                reporter.increment_report()
            reporter.report()

            if self.hash_index is not None:
                self._log.debug("Clearing and building hash index of type %s",
                                type(self.hash_index))
                # `build_index` is expected to discard any previous state.
                self.hash_index.build_index(built_hash_vectors)
Пример #7
0
    def build_index(self, descriptors):
        """
        Rebuild the index over the given descriptor data elements.

        The descriptor index and the hash-to-UID key-value store are cleared
        and repopulated.  When a hash index is set, it is rebuilt from the
        hash vectors generated for the new descriptors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.
        :raises ValueError: No data available in the given iterable (not
            raised directly by this method; TODO confirm which callee raises
            it).

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        if self.read_only:
            raise ReadOnlyError("Cannot modify container attributes due to "
                                "being in read-only mode.")

        self._log.debug("Clearing and adding new descriptor elements")
        self.descriptor_index.clear()
        self.descriptor_index.add_many_descriptors(descriptors)

        self._log.debug("Generating hash codes")
        # Scratch state consumed by `report_progress`.
        # NOTE(review): the meaning of the 7 slots is defined by that helper
        # -- confirm there.
        progress_state = [0] * 7
        # Hash bit-vectors, later handed to the hash index.
        built_hash_vectors = collections.deque()
        self.hash2uuids_kvstore.clear()
        for elem in self.descriptor_index:
            vec = self.lsh_functor.get_hash(elem.vector())
            built_hash_vectors.append(vec)

            # Read-modify-write of the UID set stored under this hash.
            code = bit_vector_to_int_large(vec)
            #: :type: set
            uid_set = self.hash2uuids_kvstore.get(code, set())
            uid_set.add(elem.uuid())
            self.hash2uuids_kvstore.add(code, uid_set)

            report_progress(self._log.debug, progress_state, 1.0)
        # Final report with a zero interval.
        # NOTE(review): the decrement presumably offsets a count bump done
        # inside `report_progress` -- confirm.
        progress_state[1] -= 1
        report_progress(self._log.debug, progress_state, 0)

        if self.hash_index is not None:
            self._log.debug("Clearing and building hash index of type %s",
                            type(self.hash_index))
            # `build_index` is expected to discard any previous state.
            self.hash_index.build_index(built_hash_vectors)
Пример #8
0
    def _build_index(self, descriptors):
        """
        Rebuild this index from scratch over the given descriptor elements.

        The descriptor index and the hash-to-UID key-value store are cleared
        and repopulated; the store is filled with a single batch call after
        all hash codes have been generated.  When a hash index is set, it is
        rebuilt from the generated hash vectors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due to "
                                    "being in read-only mode.")

            self._log.debug("Clearing and adding new descriptor elements")
            self.descriptor_index.clear()
            self.descriptor_index.add_many_descriptors(descriptors)

            self._log.debug("Generating hash codes")
            self.hash2uuids_kvstore.clear()
            reporter = ProgressReporter(self._log.debug, 1.0).start()
            # Hash bit-vectors, later handed to the hash index.
            #: :type: collections.deque[numpy.ndarray[bool]]
            built_hash_vectors = collections.deque()
            # The store was just emptied, so the full mapping can be
            # gathered locally and written with one call after the loop.
            pending = collections.defaultdict(set)
            for elem in self.descriptor_index:
                vec = self.lsh_functor.get_hash(elem.vector())
                built_hash_vectors.append(vec)
                code = bit_vector_to_int_large(vec)
                pending[code].add(elem.uuid())
                reporter.increment_report()
            reporter.report()
            self.hash2uuids_kvstore.add_many(pending)
            del pending

            if self.hash_index is not None:
                self._log.debug("Clearing and building hash index of type %s",
                                type(self.hash_index))
                # `build_index` is expected to discard any previous state.
                self.hash_index.build_index(built_hash_vectors)
Пример #9
0
        def iter_add_hashes():
            """
            Yield each distinct hash code once while filling ``hash2uuid``.

            Every descriptor's UUID is recorded under its integer hash code
            in ``hash2uuid``.  A hash bit-vector is yielded only the first
            time its integer code is seen.  A throughput message is logged
            at most once per second.
            """
            started = last_report = time.time()
            for elem in descriptor_index.iterdescriptors():
                vec = hash_functor.get_hash(elem.vector())
                code = bit_vector_to_int_large(vec)
                if code not in hash2uuid:
                    # The code is registered only after the consumer
                    # resumes this generator.
                    yield vec
                    hash2uuid[code] = set()

                    now = time.time()
                    if now - last_report >= 1.0:
                        seen = len(hash2uuid)
                        cls.logger().debug(
                            "yielding %f hashes per second "
                            "(%d of %d total)",
                            seen / (now - started),
                            seen,
                            descriptor_index.count(),
                        )
                        last_report = now

                hash2uuid[code].add(elem.uuid())
Пример #10
0
        def iter_add_hashes():
            """
            Yield each distinct hash code once while filling ``hash2uuid``.

            Every descriptor's UUID is recorded under its integer hash code
            in ``hash2uuid``.  A hash bit-vector is yielded only the first
            time its integer code is seen.  A throughput message is logged
            at most once per second.
            """
            started = last_report = time.time()
            for elem in descriptor_index.iterdescriptors():
                vec = hash_functor.get_hash(elem.vector())
                code = bit_vector_to_int_large(vec)
                if code not in hash2uuid:
                    # The code is registered only after the consumer
                    # resumes this generator.
                    yield vec
                    hash2uuid[code] = set()

                    now = time.time()
                    if now - last_report >= 1.0:
                        seen = len(hash2uuid)
                        cls.logger().debug(
                            "yielding %f hashes per second "
                            "(%d of %d total)",
                            seen / (now - started),
                            seen,
                            descriptor_index.count(),
                        )
                        last_report = now

                hash2uuid[code].add(elem.uuid())
Пример #11
0
    def _nn(self, h, n=1):
        """
        Find the ``n`` indexed hash codes closest to the query hash code.

        Closeness is measured by hamming distance.  Reported distances are
        that distance divided by the number of bits in the query (normalized
        hamming distance).

        When this internal method is called, we have already checked that our
        index is not empty.

        :param h: Query hash code bit-vector.  Should be the same bit length
            as the indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Maximum number of nearest neighbors to return.
        :type n: int

        :return: Two parallel lists: the neighbor hash codes as bit-vectors,
            ordered nearest first, and the normalized distance to each.
        :rtype: (list[numpy.ndarray[bool]], list[float])

        """
        with self._model_lock:
            query_int = bit_vector_to_int_large(h)
            n_bits = len(h)

            def dist_to_query(code):
                # Ranking key: hamming distance between the query and one
                # indexed integer hash code.
                return hamming_distance(query_int, code)

            #: :type: list[int|long]
            closest = heapq.nsmallest(n, self.index, key=dist_to_query)
            neighbors = [int_to_bit_vector_large(code, n_bits)
                         for code in closest]
            normalized = [hamming_distance(code, query_int) / float(n_bits)
                          for code in closest]
            return neighbors, normalized
Пример #12
0
    def _nn(self, h, n=1):
        """
        Find the ``n`` indexed hash codes closest to the query hash code.

        Closeness is measured by hamming distance.  Reported distances are
        that distance divided by the number of bits in the query (normalized
        hamming distance).

        When this internal method is called, we have already checked that our
        index is not empty.

        :param h: Query hash code bit-vector.  Should be the same bit length
            as the indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Maximum number of nearest neighbors to return.
        :type n: int

        :return: Two parallel lists: the neighbor hash codes as bit-vectors,
            ordered nearest first, and the normalized distance to each.
        :rtype: (list[numpy.ndarray[bool]], list[float])

        """
        with self._model_lock:
            query_int = bit_vector_to_int_large(h)
            n_bits = len(h)

            def dist_to_query(code):
                # Ranking key: hamming distance between the query and one
                # indexed integer hash code.
                return hamming_distance(query_int, code)

            #: :type: list[int|long]
            closest = heapq.nsmallest(n, self.index, key=dist_to_query)
            neighbors = [int_to_bit_vector_large(code, n_bits)
                         for code in closest]
            normalized = [hamming_distance(code, query_int) / float(n_bits)
                          for code in closest]
            return neighbors, normalized
Пример #13
0
 def get_hash(u):
     """
     Pair a descriptor UID with the integer form of its hash code.

     :param u: UID of the descriptor to look up in ``index``.
     :return: ``(u, hash_int)``, where ``hash_int`` is the integer form of
         the hash ``functor`` generates for the descriptor's vector.
     """
     descriptor_vec = index.get_descriptor(u).vector()
     hash_bits = functor.get_hash(descriptor_vec)
     return u, bit_utils.bit_vector_to_int_large(hash_bits)
Пример #14
0
    def _remove_from_index(self, uids):
        """
        Drop the descriptors with the given UIDs from this index.

        Each UID is removed from the UID set associated with its descriptor's
        hash code.  Changes to the hash-to-UID key-value store are gathered
        locally and applied with one batch update and one batch removal.  A
        hash whose UID set becomes empty is also removed from the hash index
        when one is set.  Finally the descriptors themselves are removed from
        the descriptor index.

        :param uids: Iterable of UIDs of descriptors to remove from this index.
        :type uids: collections.Iterable[collections.Hashable]

        :raises KeyError: One or more UIDs provided do not match any stored
            descriptors.  The index should not be modified.
        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            # Materialize so the UIDs can be traversed more than once.
            uids = list(uids)

            # Hash every descriptor up front, before touching any stored
            # state.  `get_many_descriptors` is expected to raise KeyError
            # for unknown UIDs, which aborts here with nothing modified.
            self._log.debug("Removing hash2uid entries for UID's descriptors")
            hashed = []
            for elem in self.descriptor_index.get_many_descriptors(uids):
                vec = self.lsh_functor.get_hash(elem.vector())
                hashed.append((bit_vector_to_int_large(vec), vec))

            # All UIDs resolved.  Gather the kv-store changes locally:
            #   pending        - hash code -> reduced, still non-empty UID set
            #   emptied_keys   - hash codes whose UID set became empty
            #   emptied_hashes - bit-vectors of those codes, for the hash index
            emptied_hashes = collections.deque()
            pending = {}
            emptied_keys = set()
            for uid, (code, vec) in zip(uids, hashed):
                if code in pending:
                    remaining = pending[code]
                else:
                    # First time seeing this key: start from the stored value.
                    remaining = self.hash2uuids_kvstore.get(code, set())
                remaining -= {uid}
                if remaining:
                    pending[code] = remaining
                else:
                    # No UID left for this hash: flag the key for removal.
                    pending.pop(code, None)
                    emptied_keys.add(code)
                    emptied_hashes.append(vec)
            self._log.debug("Updating hash2uuids: modified relations")
            self.hash2uuids_kvstore.add_many(pending)
            self._log.debug("Updating hash2uuids: removing empty hash keys")
            self.hash2uuids_kvstore.remove_many(emptied_keys)
            del pending, emptied_keys

            # Only touch the hash index when one is set and some hash lost
            # its last UID.
            if self.hash_index and emptied_hashes:
                self.hash_index.remove_from_index(emptied_hashes)

            # Lastly, forget the descriptors themselves.
            self.descriptor_index.remove_many_descriptors(uids)
Пример #15
0
 def get_hash(u):
     """
     Pair a descriptor UID with the integer form of its hash code.

     :param u: UID of the descriptor to look up in ``index``.
     :return: ``(u, hash_int)``, where ``hash_int`` is the integer form of
         the hash ``functor`` generates for the descriptor's vector.
     """
     descriptor_vec = index.get_descriptor(u).vector()
     hash_bits = functor.get_hash(descriptor_vec)
     return u, bit_utils.bit_vector_to_int_large(hash_bits)
Пример #16
0
    def _remove_from_index(self, uids):
        """
        Drop the descriptors with the given UIDs from this index.

        Each UID is removed from the UID set associated with its descriptor's
        hash code.  Changes to the hash-to-UID key-value store are gathered
        locally and applied with one batch update and one batch removal.  A
        hash whose UID set becomes empty is also removed from the hash index
        when one is set.  Finally the descriptors themselves are removed from
        the descriptor index.

        :param uids: Iterable of UIDs of descriptors to remove from this index.
        :type uids: collections.Iterable[collections.Hashable]

        :raises KeyError: One or more UIDs provided do not match any stored
            descriptors.  The index should not be modified.
        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            # Materialize so the UIDs can be traversed more than once.
            uids = list(uids)

            # Hash every descriptor up front, before touching any stored
            # state.  `get_many_descriptors` is expected to raise KeyError
            # for unknown UIDs, which aborts here with nothing modified.
            self._log.debug("Removing hash2uid entries for UID's descriptors")
            hashed = []
            for elem in self.descriptor_index.get_many_descriptors(uids):
                vec = self.lsh_functor.get_hash(elem.vector())
                hashed.append((bit_vector_to_int_large(vec), vec))

            # All UIDs resolved.  Gather the kv-store changes locally:
            #   pending        - hash code -> reduced, still non-empty UID set
            #   emptied_keys   - hash codes whose UID set became empty
            #   emptied_hashes - bit-vectors of those codes, for the hash index
            emptied_hashes = collections.deque()
            pending = {}
            emptied_keys = set()
            for uid, (code, vec) in zip(uids, hashed):
                if code in pending:
                    remaining = pending[code]
                else:
                    # First time seeing this key: start from the stored value.
                    remaining = self.hash2uuids_kvstore.get(code, set())
                remaining -= {uid}
                if remaining:
                    pending[code] = remaining
                else:
                    # No UID left for this hash: flag the key for removal.
                    pending.pop(code, None)
                    emptied_keys.add(code)
                    emptied_hashes.append(vec)
            self._log.debug("Updating hash2uuids: modified relations")
            self.hash2uuids_kvstore.add_many(pending)
            self._log.debug("Updating hash2uuids: removing empty hash keys")
            self.hash2uuids_kvstore.remove_many(emptied_keys)
            del pending, emptied_keys

            # Only touch the hash index when one is set and some hash lost
            # its last UID.
            if self.hash_index and emptied_hashes:
                self.hash_index.remove_from_index(emptied_hashes)

            # Lastly, forget the descriptors themselves.
            self.descriptor_index.remove_many_descriptors(uids)