Exemplo n.º 1
0
 def test_nn(self) -> None:
     """
     Check that ``nn`` on a small index returns all codes ordered by
     distance from the all-zero query.

     The two codes with a single set bit are expected first at distance
     1/3, followed by the two codes with two set bits at distance 2/3.
     Order within each equal-distance pair is not asserted.
     """
     indexed_codes = [[0, 1, 0], [1, 1, 0], [0, 1, 1], [0, 0, 1]]
     query_code = [0, 0, 0]

     index = LinearHashIndex()
     # noinspection PyTypeChecker
     index.build_index(indexed_codes)
     # noinspection PyTypeChecker
     found_codes, found_dists = index.nn(query_code, 4)

     # Compare as sets because ties in distance have no guaranteed order.
     closest_pair = {tuple(code) for code in found_codes[:2]}
     farthest_pair = {tuple(code) for code in found_codes[2:]}
     self.assertEqual(closest_pair, {(0, 1, 0), (0, 0, 1)})
     self.assertEqual(farthest_pair, {(1, 1, 0), (0, 1, 1)})

     expected_dists = (1 / 3., 1 / 3., 2 / 3., 2 / 3.)
     numpy.testing.assert_array_almost_equal(found_dists, expected_dists)
Exemplo n.º 2
0
    def _nn(
        self,
        d: DescriptorElement,
        n: int = 1
    ) -> Tuple[Tuple[DescriptorElement, ...], Tuple[float, ...]]:
        """
        Return the nearest `N` neighbors to the given descriptor element.

        The query vector is hashed with ``self.lsh_functor`` and the nearest
        hash codes are looked up through ``self.hash_index``. When no hash
        index is set, a temporary ``LinearHashIndex`` over the keys of
        ``self.hash2uuids_kvstore`` is used instead. The descriptors stored
        under those hashes are then ranked by ``self._distance_function``
        against the query vector.

        When this internal method is called, we have already checked that there
        is a vector in ``d`` and our index is not empty.

        :param d: Descriptor element to compute the neighbors of.
        :param n: Number of nearest neighbors to find.

        :return: Tuple of the nearest DescriptorElement instances (at most
            ``n``, ordered by ascending distance), and a tuple of the distance
            values to those neighbors. Both tuples are empty when no candidate
            descriptors were found for the nearby hashes.

        """
        LOG.debug("generating hash for descriptor")
        d_v = d.vector()
        d_h = self.lsh_functor.get_hash(d_v)

        def comp_descr_dist(d2_v: numpy.ndarray) -> float:
            # Distance between the query vector and a candidate vector.
            return self._distance_function(d_v, d2_v)

        with self._model_lock:
            LOG.debug("getting near hashes")
            hi = self.hash_index
            if hi is None:
                # Make on-the-fly linear index
                hi = LinearHashIndex()
                # not calling ``build_index`` because we already have the int
                # hashes.
                hi.index = set(cast(Iterator[int], self.hash2uuids_kvstore.keys()))
            near_hashes, _ = hi.nn(d_h, n)

            LOG.debug("getting UUIDs of descriptors for nearby hashes")
            neighbor_uuids: List[Hashable] = []
            for h_int in map(bit_vector_to_int_large, near_hashes):
                # If descriptor hash not in our map, we effectively skip it.
                # Get set of descriptor UUIDs for a hash code.
                near_uuids: Set[Hashable] = self.hash2uuids_kvstore.get(h_int, set())
                # Accumulate matching descriptor UUIDs to a list.
                neighbor_uuids.extend(near_uuids)
            LOG.debug("-- matched %d UUIDs", len(neighbor_uuids))

            LOG.debug("getting descriptors for neighbor_uuids")
            neighbors = \
                list(self.descriptor_set.get_many_descriptors(neighbor_uuids))

        # Done with model parts at this point, so releasing lock.

        LOG.debug(f"ordering descriptors via distance method {self.distance_method}")
        LOG.debug('-- getting element vectors')
        neighbor_vectors = numpy.asarray(list(
            parallel_map(lambda d_: d_.vector(), neighbors)
        ))
        LOG.debug('-- calculating distances')
        distances = list(map(comp_descr_dist, neighbor_vectors))
        LOG.debug('-- ordering')
        ordered = sorted(zip(neighbors, distances),
                         key=lambda p: p[1])
        LOG.debug(f'-- slicing top n={n}')
        top_n = ordered[:n]
        if not top_n:
            # ``zip(*[])`` produces nothing, so unpacking it into two names
            # would raise ValueError. Return an explicitly empty result.
            return (), ()
        r_descrs: Tuple[DescriptorElement, ...]
        r_dists: Tuple[float, ...]
        r_descrs, r_dists = zip(*top_n)
        return r_descrs, r_dists