def test_rand_large(self):
    """
    Compare ``df.hamming_distance`` against a reference bit count over
    1000 pairs of values produced by ``gen(1024)``.
    """
    bit_width = 1024
    trials = 1000
    for _ in xrange(trials):
        lhs = gen(bit_width)
        rhs = gen(bit_width)
        # Reference value: the number of set bits in the XOR of both
        # values is the number of bit positions in which they differ.
        expected = bin(lhs ^ rhs).count('1')
        ntools.assert_equal(df.hamming_distance(lhs, rhs), expected)
def nn(self, h, n=1):
    """
    Find the `n` indexed hash codes closest to the given hash code.

    Closeness is measured by hamming distance. Reported distances are
    divided by the number of bits in the query, so each one is the fraction
    of the query's bits that differ (range [0,1]).

    :param h: Hash code to compute the neighbors of. Should be the same bit
        length as indexed hash codes.
    :type h: numpy.ndarray[bool]

    :param n: Number of nearest neighbors to find.
    :type n: int

    :raises ValueError: No index to query from.

    :return: Pair of parallel lists: the nearest hash codes as bit vectors,
        ordered nearest first, and the normalized distance to each of them.
    :rtype: (list[numpy.ndarray[bool]], list[float])

    """
    # The parent implementation is invoked only for its side effects; its
    # return value is discarded.
    # NOTE(review): presumably this is where the documented ValueError is
    # raised -- confirm against the parent class.
    super(LinearHashIndex, self).nn(h, n)

    query_int = bit_vector_to_int_large(h)
    bit_count = len(h)

    def distance_to_query(code):
        return hamming_distance(query_int, code)

    #: :type: list[int|long]
    closest_codes = heapq.nsmallest(n, self.index, distance_to_query)

    raw_distances = [hamming_distance(code, query_int)
                     for code in closest_codes]
    code_vectors = [int_to_bit_vector_large(code, bit_count)
                    for code in closest_codes]
    # float() forces true division so the result is a fraction, not a
    # floored integer.
    fractions = [dist / float(bit_count) for dist in raw_distances]
    return code_vectors, fractions
def nn(self, h, n=1):
    """
    Find the `n` indexed hash codes closest to the given hash code.

    Closeness is measured by hamming distance. Reported distances are
    divided by the number of bits in the query, so each one is the fraction
    of the query's bits that differ (range [0,1]).

    :param h: Hash code to compute the neighbors of. Should be the same bit
        length as indexed hash codes.
    :type h: numpy.ndarray[bool]

    :param n: Number of nearest neighbors to find.
    :type n: int

    :raises ValueError: No index to query from.

    :return: Pair of parallel lists: the nearest hash codes as bit vectors,
        ordered nearest first, and the normalized distance to each of them.
    :rtype: (list[numpy.ndarray[bool]], list[float])

    """
    # The parent implementation is invoked only for its side effects; its
    # return value is discarded.
    # NOTE(review): presumably this is where the documented ValueError is
    # raised -- confirm against the parent class.
    super(LinearHashIndex, self).nn(h, n)

    query_int = bit_vector_to_int_large(h)
    bit_count = len(h)

    def distance_to_query(code):
        return hamming_distance(query_int, code)

    #: :type: list[int|long]
    closest_codes = heapq.nsmallest(n, self.index, distance_to_query)

    raw_distances = [hamming_distance(code, query_int)
                     for code in closest_codes]
    code_vectors = [int_to_bit_vector_large(code, bit_count)
                    for code in closest_codes]
    # float() forces true division so the result is a fraction, not a
    # floored integer.
    fractions = [dist / float(bit_count) for dist in raw_distances]
    return code_vectors, fractions
def nn(self, d, n=1):
    """
    Return the nearest `N` neighbors to the given descriptor element.

    Candidate descriptors are first gathered from the code bins whose codes
    are closest, by hamming distance, to the small code of the query. The
    candidates are then ranked with this index's distance function and the
    top `n` are returned.

    :param d: Descriptor element to compute the neighbors of.
    :type d: smqtk.representation.DescriptorElement

    :param n: Number of nearest neighbors to find.
    :type n: int

    :return: Tuple of nearest N DescriptorElement instances, and a tuple of
        the distance values to those neighbors. Both tuples are empty when
        no candidate descriptors could be collected (for example when there
        are no codes in the index or `n` is not positive).
    :rtype: (tuple[smqtk.representation.DescriptorElement], tuple[float])

    """
    d_vec, _, d_sc = self.get_small_code(d)

    # Extract the `n` nearest codes to the code of the query descriptor
    # - a code may associate with multiple hits, but it is a safe assumption
    #   that the top `n` codes, which exist because there is at least one
    #   element in association with each of them, yield at least `n`
    #   descriptors between them.
    self._log.debug("fetching nearest %d codes", n)
    code_set = self._code_index.codes()
    # TODO: Optimize this step
    #: :type: list[int]
    near_codes = heapq.nsmallest(
        n, code_set,
        lambda e: distance_functions.hamming_distance(d_sc, e)
    )

    # Collect descriptors from subsequently farther away bins until we have
    # >= `n` descriptors, which we will more finely sort after this.
    #: :type: list[smqtk.representation.DescriptorElement]
    self._log.debug("Collecting descriptors from near codes")
    neighbors = []
    termination_count = min(n, self.count())
    for nc in near_codes:
        neighbors.extend(self._code_index.get_descriptors(nc))
        # Break out if we've collected >= `n` descriptors, as descriptors
        # from more distant codes are likely to not be any closer.
        if len(neighbors) >= termination_count:
            break

    # Nothing was collected: return empty results instead of failing on the
    # `zip(*...)` unpacking below, which requires at least one pair.
    if not neighbors:
        return (), ()

    # Compute fine-grain distance measurements for collected elements + sort
    self._log.debug("elements to numpy")
    neighbor_vectors = elements_to_matrix(neighbors,
                                          use_multiprocessing=False,
                                          report_interval=1)
    self._log.debug("Sorting descriptors: %d", len(neighbors))
    # Materialize the distances so the result does not depend on `map`
    # being eager.
    distances = [self._dist_func(d_vec, neighbor_vec)
                 for neighbor_vec in neighbor_vectors]

    # Sort by distance, return top n
    self._log.debug("Forming output")
    ordered = sorted(zip(neighbors, distances), key=lambda p: p[1])
    neighbors, distances = zip(*(ordered[:n]))
    return neighbors, distances
def test_hd_0(self):
    """Check that ``df.hamming_distance(0, 0)`` is 0."""
    observed = df.hamming_distance(0, 0)
    ntools.assert_equal(observed, 0)