Пример #1
0
 def test_rand_large(self):
     # Cross-check df.hamming_distance against a reference computation:
     # the number of '1' digits in the binary form of a XOR b.
     # Operands come from gen(n) -- presumably random n-bit integers;
     # confirm against gen's definition.
     n = 1024
     # `range` rather than the Python 2-only `xrange`, so the loop runs
     # unchanged on both Python 2 and Python 3. The index is not needed.
     for _ in range(1000):
         a = gen(n)
         b = gen(n)
         expected = bin(a ^ b).count('1')
         ntools.assert_equal(df.hamming_distance(a, b), expected)
Пример #2
0
    def nn(self, h, n=1):
        """
        Find the `n` indexed hash codes closest to `h` by hamming distance.

        Each reported distance is the hamming distance between the query and
        the neighbor divided by the number of bits in the query, i.e. the
        fraction of differing bits.

        :param h: Query hash code. Should be the same bit length as the
            indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Number of nearest neighbors to find.
        :type n: int

        :raises ValueError: No index to query from. (Not raised in this
            method body; presumably comes from the parent-class call --
            confirm against the base class.)

        :return: Pair of parallel lists, nearest first: the neighbor hash
            codes converted back to bit vectors, and the normalized distance
            to each of them.
        :rtype: (list[numpy.ndarray[bool]], list[float])

        """
        # The parent implementation is invoked first; its return value is
        # not used here.
        super(LinearHashIndex, self).nn(h, n)

        query_code = bit_vector_to_int_large(h)
        bit_count = len(h)

        def distance_to_query(code):
            return hamming_distance(query_code, code)

        #: :type: list[int|long]
        closest = heapq.nsmallest(n, self.index, key=distance_to_query)

        # Distances for the selected codes only, in the same order.
        raw_distances = [hamming_distance(code, query_code)
                         for code in closest]

        neighbor_vectors = [int_to_bit_vector_large(code, bit_count)
                            for code in closest]
        normalized = [raw / float(bit_count) for raw in raw_distances]
        return neighbor_vectors, normalized
Пример #3
0
    def nn(self, h, n=1):
        """
        Find the `n` indexed hash codes closest to `h` by hamming distance.

        Each reported distance is the hamming distance between the query and
        the neighbor divided by the number of bits in the query, i.e. the
        fraction of differing bits.

        :param h: Query hash code. Should be the same bit length as the
            indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Number of nearest neighbors to find.
        :type n: int

        :raises ValueError: No index to query from. (Not raised in this
            method body; presumably comes from the parent-class call --
            confirm against the base class.)

        :return: Pair of parallel lists, nearest first: the neighbor hash
            codes converted back to bit vectors, and the normalized distance
            to each of them.
        :rtype: (list[numpy.ndarray[bool]], list[float])

        """
        # The parent implementation is invoked first; its return value is
        # not used here.
        super(LinearHashIndex, self).nn(h, n)

        query_code = bit_vector_to_int_large(h)
        bit_count = len(h)

        def distance_to_query(code):
            return hamming_distance(query_code, code)

        #: :type: list[int|long]
        closest = heapq.nsmallest(n, self.index, key=distance_to_query)

        # Distances for the selected codes only, in the same order.
        raw_distances = [hamming_distance(code, query_code)
                         for code in closest]

        neighbor_vectors = [int_to_bit_vector_large(code, bit_count)
                            for code in closest]
        normalized = [raw / float(bit_count) for raw in raw_distances]
        return neighbor_vectors, normalized
Пример #4
0
    def nn(self, d, n=1):
        """
        Return the nearest `N` neighbors to the given descriptor element.

        The search runs in two stages. First the `n` small codes closest (by
        hamming distance) to the query's small code are selected and their
        descriptors are gathered, nearest code first, until at least
        ``min(n, self.count())`` descriptors have been collected. Then the
        gathered descriptors are ranked by ``self._dist_func`` against the
        query vector and the best `n` are returned.

        :param d: Descriptor element to compute the neighbors of.
        :type d: smqtk.representation.DescriptorElement

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N DescriptorElement instances, and a tuple of
            the distance values to those neighbors. Both tuples are empty when
            no descriptors could be collected (e.g. an empty index or
            ``n <= 0``).
        :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

        """
        d_vec, _, d_sc = self.get_small_code(d)

        # Extract the `n` nearest codes to the code of the query descriptor
        # - a code may associate with multiple hits, but it is assumed that a
        #   code only exists because at least one element is associated with
        #   it, so the top `n` codes together cover at least `n` descriptors
        #   (when the index holds that many).
        self._log.debug("fetching nearest %d codes", n)
        code_set = self._code_index.codes()
        # TODO: Optimize this step
        #: :type: list[int]
        near_codes = heapq.nsmallest(
            n, code_set,
            key=lambda e: distance_functions.hamming_distance(d_sc, e)
        )

        # Collect descriptors from subsequently farther away bins until we have
        # >= `n` descriptors, which we will more finely sort after this.
        self._log.debug("Collecting descriptors from near codes")
        #: :type: list[smqtk.representation.DescriptorElement]
        neighbors = []
        termination_count = min(n, self.count())
        for nc in near_codes:
            neighbors.extend(self._code_index.get_descriptors(nc))
            # Break out if we've collected >= `n` descriptors, as descriptors
            # from more distant codes are likely to not be any closer.
            if len(neighbors) >= termination_count:
                break

        # Nothing to rank: return empty results instead of failing on the
        # `zip(*...)` unpacking below, which cannot produce two values from
        # an empty sequence.
        if not neighbors:
            return (), ()

        # Compute fine-grain distance measurements for collected elements + sort
        self._log.debug("elements to numpy")
        neighbor_vectors = elements_to_matrix(neighbors,
                                              use_multiprocessing=False,
                                              report_interval=1)
        self._log.debug("Sorting descriptors: %d", len(neighbors))
        distances = [self._dist_func(d_vec, neighbor_vec)
                     for neighbor_vec in neighbor_vectors]

        # Sort by distance, return top n
        self._log.debug("Forming output")
        ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
        neighbors, distances = zip(*(ordered[:n]))
        return neighbors, distances
Пример #5
0
 def test_hd_0(self):
     # Zero compared against zero: no bits differ, so the distance is 0.
     distance = df.hamming_distance(0, 0)
     ntools.assert_equal(distance, 0)