def _append_distances(self, v, distance, candidates): """ Apply distance implementation if specified """ if distance: # Normalize vector (stored vectors are normalized) nv = unitvec(v) candidates = [(x[0], x[1], self.distance.distance(x[0], nv)) for x in candidates] return candidates
def test_retrieval_sparse(self):
    """Store a random sparse vector and verify it is retrieved exactly."""
    delta = 0.000000001
    for _ in range(100):
        self.engine.clean_all_buckets()
        query = scipy.sparse.rand(1000, 1, density=0.05)
        attached = 'data'
        self.engine.store_vector(query, attached)
        nearest = self.engine.neighbours(query)
        vec, vec_data, vec_distance = nearest[0]
        # Stored vectors are normalized, so compare against the
        # normalized query.
        expected = unitvec(query)
        self.assertAlmostEqual(
            numpy.abs((expected - vec)).max(), 0, delta=delta)
        self.assertEqual(vec_data, attached)
        self.assertAlmostEqual(vec_distance, 0.0, delta=delta)
def __init__(self, N, vectors, coverage_ratio=0.2):
    """
    Performs exact nearest neighbour search on the data set.

    vectors can either be a numpy matrix with all the vectors
    as columns OR a python array containing the individual
    numpy vectors.

    Parameters:
        N: number of nearest neighbours computed per query vector.
        vectors: matrix with vectors as columns, or list of vectors.
        coverage_ratio: fraction of the data set used as query set.
    """
    # We need a dict from vector string representation to index
    self.vector_dict = {}
    self.N = N
    self.coverage_ratio = coverage_ratio
    numpy_vectors = numpy_array_from_list_or_numpy_array(vectors)

    # Get numpy array representation of input (one normalized vector
    # per row)
    self.vectors = numpy.vstack([unitvec(v) for v in numpy_vectors.T])

    # Build map from vector string representation to vector
    for index, v in enumerate(self.vectors):
        self.vector_dict[self.__vector_to_string(v)] = index

    # Determine the indices of query vectors used for comparance
    # with approximated search. Use at least one query vector so the
    # time normalization below never divides by zero.
    query_count = max(
        1, int(numpy.floor(self.coverage_ratio * len(self.vectors))))
    self.query_indices = []
    for k in range(query_count):
        # Spread query indices evenly over the data set
        index = numpy.floor(k * (float(len(self.vectors)) / query_count))
        index = min(index, len(self.vectors) - 1)
        self.query_indices.append(int(index))

    print('\nStarting exact search (query set size=%d)...\n' % query_count)

    # For each query vector get the closest N neighbours
    self.closest = {}
    self.exact_search_time_per_vector = 0.0

    for index in self.query_indices:
        v = self.vectors[index, numpy.newaxis]
        exact_search_start_time = time.time()
        D = cdist(v, self.vectors, 'euclidean')

        # numpy.argsort instead of scipy.argsort: the top-level numpy
        # aliases were removed from SciPy in 1.0. Skip position 0,
        # which is the query vector itself (distance 0).
        self.closest[index] = numpy.argsort(D)[0, 1:N + 1]

        # Save time needed for exact search
        exact_search_time = time.time() - exact_search_start_time
        self.exact_search_time_per_vector += exact_search_time

    print('Done with exact search...\n')

    # Normalize search time
    self.exact_search_time_per_vector /= float(len(self.query_indices))
def store_vector(self, v, data=None):
    """
    Hashes vector v and stores it in all matching buckets in the storage.
    The data argument must be JSON-serializable. It is stored with the
    vector and will be returned in search results.
    """
    # Retrieval compares against normalized vectors, so normalize
    # before storing.
    normalized = unitvec(v)
    for lshash in self.lshashes:
        # The raw (un-normalized) vector is what gets hashed.
        for key in lshash.hash_vector(v):
            self.storage.store_vector(
                lshash.hash_name, key, normalized, data)
def store_many_vectors(self, vs, data=None):
    """
    Store a batch of vectors.

    Hashes vector vs and stores them in all matching buckets in the
    storage. The data argument must be either None or a list of
    JSON-serializable object. It is stored with the vector and will be
    returned in search results.
    """
    # Retrieval compares against normalized vectors, so normalize
    # before storing.
    normalized_vectors = [unitvec(v) for v in vs]
    for lshash in self.lshashes:
        # NOTE(review): only the FIRST bucket key of each vector is used
        # here, unlike store_vector which stores under every key —
        # confirm this asymmetry is intentional.
        first_keys = [lshash.hash_vector(v)[0] for v in vs]
        self.storage.store_many_vectors(
            lshash.hash_name, first_keys, normalized_vectors, data)
def neighbours(self, v):
    """
    Hashes vector v, collects all candidate vectors from the matching
    buckets in storage, applys the (optional) distance function and
    finally the (optional) filter function to construct the returned list
    of either (vector, data, distance) tuples or (vector, data) tuples.
    """
    # Gather candidates from every bucket of every hash
    candidates = []
    for lshash in self.lshashes:
        for bucket_key in lshash.hash_vector(v, querying=True):
            candidates.extend(
                self.storage.get_bucket(lshash.hash_name, bucket_key))

    # Run the fetch-vector filter chain, each stage feeding the next
    if self.fetch_vector_filters:
        for fetch_vector_filter in self.fetch_vector_filters:
            candidates = fetch_vector_filter.filter_vectors(candidates)

    # Attach a distance to every candidate if a distance is configured
    if self.distance:
        # Stored vectors are normalized, so normalize the query too
        nv = unitvec(v)
        candidates = [(c[0], c[1], self.distance.distance(c[0], nv))
                      for c in candidates]

    # Run the vector filter chain, each stage feeding the next
    if self.vector_filters:
        for vector_filter in self.vector_filters:
            candidates = vector_filter.filter_vectors(candidates)

    return candidates
def neighbours(self, v):
    """
    Hashes vector v, collects all candidate vectors from the matching
    buckets in storage, applys the (optional) distance function and
    finally the (optional) filter function to construct the returned list
    of either (vector, data, distance) tuples or (vector, data) tuples.
    """
    # Accumulate candidate (vector, data) tuples across all hashes
    found = []
    for lshash in self.lshashes:
        keys = lshash.hash_vector(v, querying=True)
        for key in keys:
            contents = self.storage.get_bucket(lshash.hash_name, key)
            found += contents

    # First filter stage: fetch-vector filters, applied in order
    if self.fetch_vector_filters:
        for stage in self.fetch_vector_filters:
            found = stage.filter_vectors(found)

    # Optional distance: extend each tuple with distance to the
    # normalized query (stored vectors are normalized)
    if self.distance:
        query = unitvec(v)
        with_distance = []
        for item in found:
            d = self.distance.distance(item[0], query)
            with_distance.append((item[0], item[1], d))
        found = with_distance

    # Second filter stage: vector filters, applied in order
    if self.vector_filters:
        for stage in self.vector_filters:
            found = stage.filter_vectors(found)

    return found
def __vector_to_string(self, vector):
    """ Returns string representation of vector. """
    # Normalize, then round to 3 decimals so near-identical vectors
    # map to the same key string.
    rounded = numpy.round(unitvec(vector), decimals=3)
    return numpy.array_str(rounded)