def text_hist():
    """Build and persist a combined text + SIFT nearest-neighbour index.

    Loads the pickled image names and SIFT histograms from ``data/``, maps
    every image name to its description file under ``shopping/images/``,
    counts word occurrences in those files, blends the normalised word counts
    with the SIFT histograms using equal weights, fits an ``LSHForest`` on the
    blended vectors and finally pickles the text matrix, the vectorizer and
    the fitted forest back into ``data/``.
    """
    # Pickle streams are binary data: open every pickle file in binary mode
    # so the bytes are never subjected to newline/text translation.
    with open('data/sift_names.pkl', 'rb') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'rb') as f:
        sift_hists = cPickle.load(f)

    # Derive the description file name from the image name by swapping
    # 'img' -> 'descr' and '.jpg' -> '.txt'.
    filenames = [
        'shopping/images/' + name.replace('img', 'descr').replace('.jpg', '.txt')
        for name in names
    ]

    # Raw string keeps the regex backslash literal (same pattern as before).
    vectorizer = CountVectorizer(input='filename',
                                 token_pattern=r"(?u)\w+",
                                 ngram_range=(1, 1),
                                 min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    preprocessing.normalize(xall_transformed, copy=False)

    # Equal weighting between the text features and the SIFT features.
    lamb = .5
    hists = scipy.sparse.hstack(
        [xall_transformed * lamb, sift_hists * (1 - lamb)]).toarray()
    preprocessing.normalize(hists, copy=False)

    model = LSHForest()
    model.fit(hists)

    with open('data/text_hist.pkl', 'wb') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'wb') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'wb') as f:
        cPickle.dump(model, f)
def fit_lsh(self):
    """Create a fresh, seeded LSH forest and fit it on the encoded background.

    Every entry of ``self.indexed_background`` is encoded with
    ``self.encode_sentence(entry, True)`` and the resulting list is used as
    the training data of ``self.lsh``.
    """
    self.lsh = LSHForest(random_state=12345)
    encoded_background = []
    for position in range(len(self.indexed_background)):
        entry = self.indexed_background[position]
        encoded_background.append(self.encode_sentence(entry, True))
    self.lsh.fit(encoded_background)
def knn_indices_func_approx(rep_pts: FloatTensor,  # (N, pts, dim)
                            pts: FloatTensor,      # (N, x, dim)
                            K: int,
                            D: int) -> LongTensor:  # (N, pts, K)
    """
    Approximate CPU-based Indexing function based on K-Nearest Neighbors search.

    :param rep_pts: Representative points.
    :param pts: Point cloud to get indices from.
    :param K: Number of nearest neighbors to collect.
    :param D: "Spread" of neighboring points.
    :return: Array of indices, P_idx, into pts such that pts[n][P_idx[n],:]
    is the set k-nearest neighbors for the representative points in pts[n].
    The result is returned as a CPU tensor of integer indices.
    """
    # Local imports keep this block self-contained; both packages are already
    # required by the tensor/ndarray handling below.
    import numpy as np
    import torch

    # The neighbour search runs on NumPy arrays, so bring everything to host
    # memory first.
    if rep_pts.is_cuda:
        rep_pts = rep_pts.cpu()
    if pts.is_cuda:
        pts = pts.cpu()
    rep_pts = rep_pts.data.numpy()
    pts = pts.data.numpy()

    region_idx = []
    for n, p in enumerate(rep_pts):
        P_particular = pts[n]
        # Ask for D*K + 1 neighbours so that, after dropping column 0 and
        # taking every D-th column, K indices remain per representative point.
        lshf = LSHForest(n_estimators=20, n_candidates=100,
                         n_neighbors=D * K + 1)
        lshf.fit(P_particular)
        indices = lshf.kneighbors(p, return_distance=False)
        # Column 0 is skipped (presumably the query point itself when the
        # representative points are drawn from pts - TODO confirm).
        region_idx.append(indices[:, 1::D])

    # Previously the collected indices were never returned, so callers got
    # None despite the documented/annotated return value.
    return torch.from_numpy(np.stack(region_idx, axis=0)).long()
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """

    def __init__(self, lsh_init=None):
        """Wrap ``lsh_init`` if given, otherwise create a default forest.

        :param lsh_init: an already constructed forest to reuse; when omitted
            a forest with 25 estimators and 1000 candidates is created.
        """
        # Identity comparison: ``== None`` would dispatch to a custom __eq__
        # of whatever object is passed in.
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        """Fit the forest on ``embedding.m`` and remember the embedding."""
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        """Return the neighbours graph of the fitted embedding.

        ``nn + 1`` neighbours are requested per row (presumably because each
        row also matches itself - TODO confirm).
        """
        dir_graph_mat = self._lsh_forest.kneighbors_graph(
            X=self._embedding.m, n_neighbors=nn + 1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        """Turn a sparse adjacency matrix into a node set and an edge set.

        :param dir_graph_mat: 2-D sparse matrix whose non-zero entry (i, j)
            denotes an edge from node i to node j.
        :return: ``(nodes, edges)`` where ``nodes`` is the set of row indices
            and ``edges`` is the set of ``(i, j)`` tuples of non-zero entries.
        """
        node_count = dir_graph_mat.shape[0]
        nodes = set(range(node_count))
        edges = set()
        # Iterate over the row indices; the previous code tried to iterate
        # over the integer ``shape[0]`` itself, which raises TypeError.
        for node_i in range(node_count):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, int(node_j)))
        # NOTE(review): edges keep their (source, target) orientation; both
        # (i, j) and (j, i) can appear.
        return nodes, edges

    def get_forest(self):
        """Return the wrapped forest object."""
        return self._lsh_forest

    def get_node_to_word(self):
        """Return ``self.iw`` (``None`` until something assigns it)."""
        return self.iw
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Score every row of X with a local-outlier-factor style measure.

    Neighbours are found with an approximate ``LSHForest`` search (not an
    exact tree search). ``k`` neighbours are requested per row and the first
    returned neighbour is discarded before scoring.

    :param X: data to analyse; it is both the fitted set and the query set.
    :param k: number of neighbours requested from the forest.
    :param outlier_threshold: rows scoring above this value are reported.
    :param verbose: when true, print the threshold and the detected outliers.
    :return: list of ``[row_index, X[row_index], score]`` for each outlier.
    """
    lshf = LSHForest(random_state=42)
    lshf.fit(X)
    distance, index = lshf.kneighbors(X, n_neighbors=k)
    # Drop the first neighbour column (presumably each row matching itself;
    # an approximate search does not guarantee that - TODO confirm).
    distance, index = distance[:, 1:], index[:, 1:]
    # Distance to the farthest retained neighbour of each row.
    radius = distance[:, -1]

    # Calculate LRD: mean reachability distance to the neighbours, inverted.
    LRD = np.mean(np.maximum(distance, radius[index]), axis=1)
    r = 1. / np.array(LRD)

    # Calculate outlier score.
    # NOTE(review): the denominator is cast to float16 and the score is
    # scaled by 1/k although only k-1 neighbours are summed; both are kept
    # exactly as they were so existing thresholds keep their meaning.
    outlier_score = np.sum(r[index], axis=1) / np.array(r, dtype=np.float16)
    outlier_score *= 1. / k

    if verbose:
        print("Recording all outliers with outlier score greater than %s." \
              % (outlier_threshold))

    outliers = [[i, X[i], score]
                for i, score in enumerate(outlier_score)
                if score > outlier_threshold]

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """

    def __init__(self, lsh_init=None):
        """Reuse ``lsh_init`` when provided, else build a default forest.

        :param lsh_init: optional pre-built forest; the default forest uses
            25 estimators and 1000 candidates.
        """
        # ``is None`` checks identity and cannot be hijacked by __eq__.
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        """Fit the wrapped forest on ``embedding.m`` and keep the embedding."""
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        """Return the ``kneighbors_graph`` of the stored embedding.

        The forest is queried for ``nn + 1`` neighbours per row.
        """
        dir_graph_mat = self._lsh_forest.kneighbors_graph(
            X=self._embedding.m, n_neighbors=nn+1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        """Extract nodes and edges from a sparse adjacency matrix.

        :param dir_graph_mat: 2-D sparse matrix; a non-zero at (i, j) is an
            edge between node i and node j.
        :return: tuple ``(nodes, edges)``: the set of row indices and the set
            of ``(i, j)`` pairs that are non-zero.
        """
        row_count = dir_graph_mat.shape[0]
        nodes = set(range(row_count))
        edges = set()
        # ``shape[0]`` is an int and cannot be iterated directly; walk the
        # row indices instead (the old loop raised TypeError).
        for node_i in range(row_count):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, int(node_j)))
        # NOTE(review): pairs are stored as found, so an edge may be present
        # in both orientations.
        return nodes, edges

    def get_forest(self):
        """Return the wrapped forest object."""
        return self._lsh_forest

    def get_node_to_word(self):
        """Return ``self.iw`` (``None`` unless assigned elsewhere)."""
        return self.iw
def single_batch(self, tweets):
    """Run an approximate nearest-neighbour search over one batch of tweets.

    ``tweets`` must be a list holding only the text of each tweet. The return
    value is the list of positions (within ``tweets``) whose radius query
    yields more than two neighbours, i.e. the tweets treated as spam.
    """
    # Vectorize the batch and fit the forest on it.
    vectorizer = CountVectorizer(stop_words=self.common_twitter_handles)
    batch_matrix = vectorizer.fit_transform(tweets)
    forest = LSHForest()
    forest.fit(batch_matrix)

    # Query the forest with every tweet of the batch.
    found_per_tweet = []
    for row in vectorizer.transform(tweets):
        analyzed = len(found_per_tweet)
        if analyzed % 100 == 0:
            print("%r tweets analyzed out of %r for this batch" % (analyzed, self.batch_size))
        found_per_tweet.append(forest.radius_neighbors(row, radius=.4)[1])

    # Keep the positions of tweets that have more than two close neighbours.
    flagged = []
    for position, found in enumerate(found_per_tweet):
        if len(found[0]) > 2:
            flagged.append(position)
    return flagged
def single_batch(self, tweets):
    """Performs an approximate nearest neighbors search on tweets in the
    database passed to it.

    The database must be a list of tweets (text of the tweets only). Returns
    the indices of tweets with nearby neighbors (i.e. spam tweets). These
    indices correspond to indices within the batch of tweets fed to this
    function. Tweets whose vector has at most two non-zero entries are never
    reported.
    """
    # Vectorize the batch and fit the forest on it.
    vect2 = CountVectorizer(stop_words=self.custom_stop_words)
    X2 = vect2.fit_transform(tweets)
    tree2 = LSHForest()
    tree2.fit(X2)

    working_batch_size = len(tweets)
    flagged = []
    # X2 already holds the vectorized batch, so it is iterated directly
    # instead of transforming the same tweets a second time.
    for position, x in enumerate(X2):
        if position % 100 == 0:
            print("%r tweets analyzed out of %r for this batch" % (position, working_batch_size))
        # Only deal with tweets that have more than two distinct counted
        # terms; skipping here also avoids a radius query whose result would
        # be thrown away.
        if x.getnnz() <= 2:
            continue
        neighbors = tree2.radius_neighbors(x, radius=self.sensitivity)[1]
        if len(neighbors[0]) > 2:
            flagged.append(position)
    return flagged
def __init__(self, lsh_init=None):
    """Store ``lsh_init`` as the wrapped forest, or build a default one.

    :param lsh_init: optional pre-built forest; when omitted a forest with
        25 estimators and 1000 candidates is created.
    """
    # Identity check: ``== None`` would call a custom __eq__ on lsh_init.
    if lsh_init is None:
        self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
    else:
        self._lsh_forest = lsh_init
    self.iw = None
    self.m = None
def train():
    """Match every test keyword to its nearest SKU name and log the result.

    The SKU names are tokenised, TF-IDF vectorised and indexed with an
    ``LSHForest``. Each test keyword is then vectorised with the same
    vocabulary, its single nearest SKU name is looked up, and the pair is
    printed and appended to ``result/lsh_v1_results.txt``.
    """
    # Build the matching corpus (398872 samples).
    sku_names_texts = get_train_datas()
    sku_names_with_spaces = [' '.join(sku_names)
                             for sku_names in get_text_jieba(sku_names_texts)]

    # Test data (1000 samples).
    keywords_texts = get_test_datas()
    keywords_with_spaces = [' '.join(keywords)
                            for keywords in get_text_jieba(keywords_texts)]

    tfidf_vec = TfidfVectorizer(min_df=3, max_features=None,
                                ngram_range=(1, 2), use_idf=1, smooth_idf=1,
                                sublinear_tf=1)
    x_train = tfidf_vec.fit_transform(sku_names_with_spaces)

    lshf = LSHForest(random_state=42)
    lshf.fit(x_train)

    # Open the results file once instead of re-opening it for every keyword.
    with open("result/lsh_v1_results.txt", 'a', encoding='utf8') as wf:
        for i, kw in enumerate(keywords_with_spaces):
            x_test = tfidf_vec.transform([kw])
            _, indices = lshf.kneighbors(x_test.toarray(), n_neighbors=1)
            idx = indices[0][0]
            print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])
            wf.write(str(i) + "||" + keywords_texts[i] + "||" + sku_names_texts[idx] + "\n")
class Index(BaseIndex):
    """
    LSH Forest Index
    """
    name = 'lsh_forest'

    def _fit(self, xs):
        """
        Build the forest and fit it on the given vectors

        :param xs: training vectors handed to ``LSHForest.fit``
        :return:
        """
        tree_count = self.parameters.get('n_estimators', 20)
        self.index = LSHForest(n_estimators=tree_count)
        self.index.fit(xs)

    def _query(self, sample, k=5, **kwargs):
        """
        Look up the neighbours of one sample

        :param sample: Sample
        :param k: ``k + 1`` neighbours are requested from the forest
        :param kwargs: ignored
        :return: list of dicts with ``hashval`` and ``similarity`` keys, one
            per returned neighbour
        """
        x, _ = self.transform([sample])
        distances, idxs = self.index.kneighbors(x, n_neighbors=k + 1)
        # Similarity is 1 - distance, capped at 1.0.
        return [
            {
                'hashval': self.ys[idx],
                'similarity': min(1 - float(d), 1.0)
            }
            for idx, d in zip(idxs[0], distances[0])
        ]
class ScikitLearnLsh(NearestNeighborAlgorithm):
    """
    This ``NearestNeighborAlgorithm`` uses scikit-learn's implementation of a
    locality sensitive hash to find approximate nearest neighbors.

    Parameters
    ----------
    random_state: int, optional (default=12345)
        Used to initialize the LSHForest, so that runs are consistent.
    """

    def __init__(self, params: Dict[str, Any]):
        random_state = params.pop('random_state', 12345)
        self.lsh = LSHForest(random_state=random_state)

    def fit(self, vectors: List[numpy.array]):
        """Fit the underlying forest on ``vectors``."""
        logger.info("Fitting LSH with %d vectors", len(vectors))
        self.lsh.fit(vectors)

    def get_neighbors(self, query_vector: numpy.array,
                      num_neighbors: int) -> List[Tuple[int, float]]:
        """Return ``(index, score)`` pairs of the nearest stored vectors.

        A 1-D ``query_vector`` is treated as a single query. When exactly one
        query is evaluated the result is one list of pairs; with several
        queries it is a list containing one such list per query.
        """
        if len(query_vector.shape) == 1:
            query_vector = [query_vector]
        logger.info("Getting neighbors for %d vectors", len(query_vector))
        scores, neighbor_indices = self.lsh.kneighbors(
            query_vector, n_neighbors=num_neighbors)
        logger.info("Neighbors retrieved")
        # Materialise the pairs: a bare zip() is a one-shot iterator on
        # Python 3, not the list the signature promises.
        result = [
            list(zip(index_row, score_row))
            for index_row, score_row in zip(neighbor_indices, scores)
        ]
        if len(result) == 1:
            result = result[0]
        return result
def CreateAndconfigureLSHForest(categories):
    """Fit an LSH forest on the category vectors and return it.

    categories - dict: {name; vector}. The forest is configured to return
    TOP_N_COUNT neighbours by default.
    """
    print("Creating LSHForest...")
    category_vectors = list(categories.values())
    category_matrix = numpy.array(category_vectors)
    forest = LSHForest(n_candidates=70, n_estimators=30,
                       n_neighbors=TOP_N_COUNT)
    forest.fit(category_matrix)
    print("LSHForest was created")
    return forest
def search_neighbors(request):
    """Return the stored designs whose images are closest to an input image.

    Query parameters read from ``request.GET``:
      * ``num``   - how many neighbours to return.
      * ``input`` - file name of the query image inside ``DESIGN_PATH``.

    All design images are loaded, flattened and scaled by 1/255, an
    ``LSHForest`` is fitted on them, and the ``num`` nearest designs are
    returned as JSON under the ``results`` key.
    """
    import os

    designs = Design.objects.all()
    image_list = [str(design.uid) + ".png" for design in designs]

    d_geometry = settings.D_GEOMETRY
    flat_size = d_geometry[0] * d_geometry[1] * 3
    designed_images = np.empty((len(image_list), flat_size), dtype="float32")
    for i, image_name in enumerate(image_list):
        designed_images[i] = img2numpy_arr(
            settings.DESIGN_PATH + image_name).reshape(flat_size)
    designed_images /= 255

    lshf = LSHForest(random_state=42)
    lshf.fit(designed_images)

    num = int(request.GET['num'])
    # The file name comes straight from the request: keep only its final
    # path component so it cannot point outside DESIGN_PATH ("../" etc.).
    input_fname = os.path.basename(str(request.GET['input']))
    input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
    input_image = input_image.reshape(1, -1)/255

    _, indices = lshf.kneighbors(input_image, n_neighbors=num)

    similar_images = []
    for i in list(indices.reshape(-1)):
        similar_images.append({
            "image": str(designs[i].uid) + ".png",
            "text": str(designs[i].history_text),
            "like": int(designs[i].like),
            "filtered": str(designs[i].filtered)
        })
    return JsonResponse({
        "results": similar_images
    })
def Classify(nlp, keywords, categories):
    """Print the categories whose vectors are nearest to the keyword vector.

    keywords - list; categories - dict: {name; vector}. The keyword vector is
    the sum of the individual word vectors, each weighted by how often the
    word occurs in ``keywords``.
    """
    # Count duplicates once so every distinct word is embedded a single time.
    occurrences = Counter(keywords)
    sumVector = numpy.zeros(nlp.vocab.vectors_length)

    # Temporary diagnostic: compare against the vector of the joined text.
    joined_text = ' '.join(keywords)

    # Accumulate the weighted word vectors.
    for word, repeat_count in occurrences.items():
        word_vector = nlp(word).vector
        sumVector += (word_vector * repeat_count)

    text_vector = nlp(joined_text).vector
    similarity = cosine_similarity(text_vector, sumVector)
    print("Sim: " + str(similarity))

    category_matrix = numpy.array(list(categories.values()))
    category_names = list(categories.keys())

    print("Creating LSHForest...")
    forest = LSHForest(n_candidates=70, n_estimators=30,
                       n_neighbors=TOP_N_COUNT)
    forest.fit(category_matrix)
    print("LSHForest was created")

    print("Getting neighbors...")
    distances, indices = forest.kneighbors(sumVector.reshape((1, -1)))
    print("Got neighbors.")

    for found_index in numpy.nditer(indices):
        print("Found category: " + str(category_names[found_index]))
        print("with distance: " + str(distances))
def persist_attraction_similarities_to_db():
    """Store the nearest attractions of every attraction in the database.

    The module-level ``itemuserdf`` matrix is reduced to 10 dimensions, an
    LSH forest is fitted on the reduced rows, and for each row the first 10
    of its 20 queried neighbours (excluding the row itself) are saved as
    ``SimilarAttractions`` records with similarity ``1 - distance``.
    """
    # Build the LSHForest model on the reduced-dimension dataset.
    reducer = TruncatedSVD(n_components=10, n_iter=7)
    reduced_rows = reducer.fit_transform(itemuserdf)
    item_user_model = LSHForest()
    item_user_model.fit(reduced_rows)

    queried_neighbors = 20  # neighbours requested per row
    kept_neighbors = 10     # neighbours inspected per row

    for row in range(itemuserdf.shape[0]):
        distance, indices = item_user_model.kneighbors(
            reduced_rows[row].reshape(1, -1),
            n_neighbors=queried_neighbors
        )
        weights = 1 - distance
        for slot in range(kept_neighbors):
            neighbor = indices[0][slot]
            if row == neighbor:
                # Never record an attraction as similar to itself.
                continue
            source = Attraction.objects.filter(
                app_id=int(row)).values('attraction_id')[0]
            target = Attraction.objects.filter(
                app_id=int(neighbor)).values('attraction_id')[0]
            record = SimilarAttractions(
                attraction_id=source['attraction_id'],
                similar_attraction_id=target['attraction_id'],
                similarity=weights[0][slot],
                ts=timezone.now()
            )
            record.save()
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    forest = LSHForest(n_estimators=n_estimators)
    forest.fit(X)

    # The fitted data is stored unchanged.
    assert_array_equal(X, forest._fit_X)
    # One hash function g(p) per tree.
    assert_equal(n_estimators, len(forest.hash_functions_))
    # Every hash is 32 components long.
    first_hash = forest.hash_functions_[0]
    assert_equal(32, first_hash.components_.shape[0])
    # The forest holds the requested number of trees.
    assert_equal(n_estimators, len(forest.trees_))
    # Each tree has an entry for every data point.
    assert_equal(n_samples, len(forest.trees_[0]))
    # One array of original indices (after sorting the hashes) per tree.
    assert_equal(n_estimators, len(forest.original_indices_))
    # Each of those arrays covers every data point.
    assert_equal(n_samples, len(forest.original_indices_[0]))
def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):
    """Yield, for each start graph, its matched graph from ``graphlist``.

    Both collections are vectorized on deep copies, a forest is fitted on
    the ``graphlist`` vectors and queried with two neighbours per start
    graph. With ``start_is_subset`` the second neighbour is used (the first
    one is presumably the start graph itself). Results are yielded as
    ``(start_graph, matched_graph, matched_vector)`` ordered by the index of
    the matched graph.
    """
    # Vectorize copies of both collections.
    graphlist = list(graphlist)
    X = self.vectorizer.transform_single(copy.deepcopy(graphlist))

    start_graphs = list(start_graphs)
    Y = self.vectorizer.transform_single(copy.deepcopy(start_graphs))

    forest = LSHForest()
    forest.fit(X)
    # http://scikit-learn.org/stable/modules/neighbors.html
    distances, indices = forest.kneighbors(Y, n_neighbors=2)

    # Pick the neighbour column: skip column 0 when the start graphs are
    # themselves part of graphlist.
    column = 1 if start_is_subset else 0

    # Each match is (X_index, Y_index, distance).
    matches = sorted(
        (indices[row, column], row, distances[row, column])
        for row in range(len(indices))
    )

    for Xi, Yi, dist in matches:
        yield ((start_graphs[Yi], graphlist[Xi], X[Xi]))
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        # Queries must be 2-D (one row per query); a bare 1-D sample is
        # rejected by the estimator and by pairwise_distances.
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)

        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        # The same ordering must hold for a radius query; the mean distance
        # to all samples is used as the radius.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    seed = rng.randint(0, np.iinfo(np.int32).max)
    lshf = LSHForest(n_estimators=n_estimators, random_state=seed)
    lshf.fit(X)

    # Collect the projection matrix of every tree.
    hash_functions = [lshf.hash_functions_[tree].components_
                      for tree in range(n_estimators)]

    # No single tree may share the variance of the pooled projections ...
    for tree in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[tree].components_))

    # ... nor their mean.
    for tree in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[tree].components_))
def score(factors):
    """Return the average number of correctly predicted test samples.

    The training and test vectors are scaled with ``FactorizeVectors`` and a
    new forest is fitted ``verifyCount`` times; each round counts how many
    test samples receive the right label from their 5 nearest neighbours.
    """
    verifyCount = 3
    X, y = Sets.trainingSet
    test_set, databases = Sets.testSet
    X = FactorizeVectors(X, factors)
    test_set = FactorizeVectors(test_set, factors)

    correct_sum = 0
    for _ in range(verifyCount):
        clf = LSHForest(n_estimators=10, n_candidates=10)
        clf.fit(X)

        hits = 0
        for position in range(len(test_set)):
            actual = databases[position]
            distances, indices = clf.kneighbors(test_set[position], n_neighbors=5)
            predicted = GetPrediction(y, distances[0], indices[0])
            if actual == predicted:
                hits += 1

        # The per-round "best" always started from zero, so it is simply the
        # number of hits of this round.
        correct_sum += hits

    return float(correct_sum) / verifyCount
def runForestLSHSizeAnalysis(argsdict, data, inlbl, fPath, fName, fileN, i):
    """Time the construction and fitting of an LSH forest on ``data``.

    Only ``data`` is used; the remaining parameters are accepted but ignored.

    Returns a pair ``(size, seconds)`` where ``size`` is
    ``sys.getsizeof`` of the fitted forest (a shallow size) and ``seconds``
    is the wall-clock time spent creating and fitting it.
    """
    started = time.time()
    forest = LSHForest(random_state=42)
    forest.fit(data)
    elapsed = time.time() - started
    return sys.getsizeof(forest), elapsed
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples, n_features = 100, 10
    n_iter, n_points = 10, 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for position, tree_count in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=tree_count)
        ignore_warnings(lshf.fit)(X)

        for _ in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            approximate = lshf.kneighbors(query, n_neighbors=n_points,
                                          return_distance=False)

            # Exact top-n_points neighbours by cosine distance.
            exact_distances = pairwise_distances(query, X, metric='cosine')
            exact = np.argsort(exact_distances)[0, :n_points]

            overlap = np.intersect1d(exact, approximate).shape[0]
            accuracies[position] = accuracies[position] + overlap / float(n_points)

        accuracies[position] = accuracies[position] / float(n_iter)

    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):
    """Pair every start graph with a neighbouring graph from ``graphlist``.

    Deep copies of both collections are vectorized, a forest is fitted on
    the ``graphlist`` vectors and each start graph is queried for two
    neighbours. When ``start_is_subset`` is true the second neighbour is
    taken (the first is presumably the start graph itself). The generator
    yields ``(start_graph, matched_graph, matched_vector)`` tuples ordered
    by the matched graph's index.
    """
    graphlist = list(graphlist)
    graph_vectors = self.vectorizer.transform_single(copy.deepcopy(graphlist))

    start_graphs = list(start_graphs)
    start_vectors = self.vectorizer.transform_single(copy.deepcopy(start_graphs))

    forest = LSHForest()
    forest.fit(graph_vectors)
    # http://scikit-learn.org/stable/modules/neighbors.html
    distances, indices = forest.kneighbors(start_vectors, n_neighbors=2)

    # Column 0 is the closest hit; move to column 1 for subset queries.
    neighbor_column = 0
    if start_is_subset:
        neighbor_column += 1

    # Tuples are (graph_index, start_index, distance).
    matches = []
    for start_index in range(len(indices)):
        matches.append((indices[start_index, neighbor_column],
                        start_index,
                        distances[start_index, neighbor_column]))
    matches.sort()

    for graph_index, start_index, _ in matches:
        yield ((start_graphs[start_index], graphlist[graph_index], graph_vectors[graph_index]))
def get_heap_and_forest(self, griter, k):
    '''
    Build the start heap and the neighbour forest.

    The heap holds (score, counter, graph) tuples where the score is the
    estimator's probability for class 1; the forest is an LSHForest fitted
    on the vectorized graphs. Also returns one representative
    second-neighbour distance.
    '''
    graphs = list(griter)
    # transform() alters the graph objects, so work from an untouched copy.
    pristine_graphs = copy.deepcopy(graphs)
    X = self.vectorizer.transform(graphs)

    forest = LSHForest()
    forest.fit(X)
    print('got forest')

    heap = []
    for _, graph in zip(X, pristine_graphs):
        graph_copy = nx.Graph(graph)
        graph_vector = self.sampler.vectorizer.transform_single(graph_copy)
        # score ~ distance from the hyperplane
        graph_score = self.sampler.estimator.predict_proba(graph_vector)[0][1]
        # counter k + 1 keeps the start graphs out of the final output
        heapq.heappush(heap, (graph_score, k + 1, graph))
    print('got heap')

    distances, unused = forest.kneighbors(X, n_neighbors=2)
    # Column 1 holds the distance to the second returned neighbour.
    second_distances = [row[1] for row in distances]
    # The entry at the middle position is used (the list is not sorted).
    avg_dist = second_distances[len(second_distances) // 2]
    print('got dist')

    return heap, forest, avg_dist
def optimise(self, num_train_points, num_val_points, parameters):
    """Search the parameter combinations for the most accurate forest.

    For every combination produced by ``self.get_generator(parameters)`` a
    forest is fitted on the first ``num_train_points`` training images and
    evaluated on the first ``num_val_points`` validation images.

    Returns ``(max_accuracy, optimal_estimators, optimal_n_neighbours)``;
    all three are -1 when no combination beats an accuracy of -1.
    """
    best = (-1, -1, -1)
    training_images = None
    for item in self.get_generator(parameters):
        candidate = LSHForest(random_state=42,
                              n_estimators=item['n_est'],
                              n_neighbors=item['n_neigh'])
        training_images = self.train.images[:num_train_points]
        candidate.fit(training_images)

        validation_images = self.validation.images[:num_val_points]
        distances, indices = candidate.kneighbors(validation_images,
                                                  n_neighbors=5)
        accuracy, positions = self.model_accuracy(indices,
                                                  is_optimising=True)
        if accuracy > best[0]:
            best = (accuracy, item['n_est'], item['n_neigh'])

    max_accuracy, optimal_estimators, optimal_n_neighbours = best
    return max_accuracy, optimal_estimators, optimal_n_neighbours
def single_batch(self, tweets):
    """Find the tweets of one batch that have close neighbours in it.

    ``tweets`` must be a list of tweet texts. An approximate radius search
    is run for every tweet against the whole batch, and the positions of the
    tweets with more than two neighbours (i.e. spam tweets) are returned.
    The positions refer to ``tweets`` as passed in.
    """
    # Vectorize the batch and fit the forest on the result.
    handle_filter = CountVectorizer(stop_words=self.common_twitter_handles)
    forest = LSHForest()
    forest.fit(handle_filter.fit_transform(tweets))

    # Collect the neighbour indices of every tweet, reporting progress on
    # every hundredth tweet.
    neighbor_lists = []
    for tweet_vector in handle_filter.transform(tweets):
        done = len(neighbor_lists)
        if done % 100 == 0:
            print("%r tweets analyzed out of %r for this batch" % (
                done, self.batch_size))
        hits = forest.radius_neighbors(tweet_vector, radius=.4)[1]
        neighbor_lists.append(hits)

    return [
        position
        for position, hits in enumerate(neighbor_lists)
        if len(hits[0]) > 2
    ]
def __init__(self, params: Dict[str, Any]):
    """Pop this class's options from ``params`` and set up the LSH state.

    Every option is removed from ``params`` before the remaining entries are
    handed to the superclass constructor.
    """
    option_defaults = (
        # Location of the corpus used for background knowledge search; it is
        # assumed to be gzipped with one sentence per line.
        ('corpus_path', None),
        # How many background sentences to collect for each input.
        ('num_background', 10),
        # Epochs to wait before differentiable search is switched on, so the
        # base memory network can first train the encoder with external
        # background knowledge.
        ('num_epochs_delay', 10),
        # Epochs between two re-encodings of the corpus.
        # TODO(matt): consider only re-encoding at early stopping, instead of
        # a number-of-epoch-based parameter.
        ('num_epochs_per_encoding', 2),
        # Only relevant when loading a model: load a pickled LSH instead of
        # rebuilding it from the input corpus. Combining a corpus path with
        # a saved LSH built from a different corpus can behave very oddly.
        ('load_saved_lsh', False),
    )
    for option_name, default_value in option_defaults:
        setattr(self, option_name, params.pop(option_name, default_value))

    # All of our own parameters are popped; the superclass gets the rest.
    super(DifferentiableSearchMemoryNetwork, self).__init__(params)

    # Member variables that depend on the superclass being initialised.
    self._sentence_encoder_model = self.__build_sentence_encoder_model()
    self.lsh = LSHForest(random_state=12345)
    self.instance_index = {}  # type: Dict[int, str]
def test_neighbors_accuracy_with_n_estimators():
    """Checks whether accuracy increases as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for j in range(n_iter):
            # Queries must be 2-D (one row per query); a bare 1-D sample is
            # rejected by the estimator and by pairwise_distances.
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            # Exact ranking by cosine distance, used as ground truth.
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)

    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    """Fit an LSH forest on ``data`` and report how long fitting took.

    Returns ``(forest, seconds)`` where ``seconds`` covers only the call to
    ``fit``; ``seed`` is passed as the forest's ``random_state``.
    """
    forest_options = dict(n_estimators=n_estimators,
                          n_candidates=n_candidates,
                          n_neighbors=n_neighbors,
                          random_state=seed)
    forest = LSHForest(**forest_options)

    started = time()
    forest.fit(data)
    elapsed = time() - started

    return forest, elapsed
def fit_model(self, data, n_estimators, n_neighbours):
    """Return a seeded LSH forest fitted on ``data``.

    ``n_estimators`` and ``n_neighbours`` are forwarded to the forest as
    ``n_estimators`` and ``n_neighbors``; the random state is fixed at 42.
    """
    forest = LSHForest(n_neighbors=n_neighbours,
                       n_estimators=n_estimators,
                       random_state=42)
    forest.fit(data)
    return forest
def create_tree(self, listNames, variableName):
    """Fit an LSH forest on the names and pickle it with its inputs.

    Intended to run only once for the main database. The names are
    tokenised and converted to TF-IDF vectors, a forest is fitted on those
    vectors, and three files are written to ``self.folderSaveData``:
    ``<variableName>_lshf.dump`` (the forest),
    ``<variableName>_listNames.dump`` (the names) and
    ``<variableName>_TF.dump`` (the first value returned by
    ``create_TDIDF``).
    """
    lshf = LSHForest(n_estimators=50, n_candidates=500)
    TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
    lshf.fit(tfidfs)

    artifacts = (
        ("lshf", lshf),
        ("listNames", listNames),
        ("TF", TF),
    )
    for suffix, payload in artifacts:
        path = "{0}/{1}_{2}.dump".format(self.folderSaveData, variableName, suffix)
        # Context manager guarantees the handle is flushed and closed; the
        # previous bare open() calls leaked all three file handles.
        with open(path, "wb+") as handle:
            pickle.dump(payload, handle)
def _fit(self, xs):
    """
    Build the forest and fit it on the given vectors

    The number of estimators is read from ``self.parameters`` and defaults
    to 20.

    :param xs: training vectors handed to ``LSHForest.fit``
    :return:
    """
    tree_count = self.parameters.get('n_estimators', 20)
    self.index = LSHForest(n_estimators=tree_count)
    self.index.fit(xs)
def hash_movie_similarity(um, num_neighbors=6):
    """Return similarities and indices of each column's nearest neighbours.

    The forest is fitted on the transpose of ``um`` and queried with the
    same rows. One extra neighbour is requested and the first column of the
    result is dropped so that a row is not compared with itself.

    Returns ``(similarity, indices)``, with similarity computed as
    ``1 - distance``.
    """
    transposed = um.T
    forest = LSHForest(random_state=470957)
    forest.fit(transposed)

    distances, neighbor_ids = forest.kneighbors(
        um.T, n_neighbors=num_neighbors + 1, return_distance=True)

    similarity = 1 - distances
    return similarity[:, 1:], neighbor_ids[:, 1:]
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Exact brute-force model used as the reference for the approximate one.
    exact_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # LSHForest configured so that results on this toy dataset are always
    # exact.
    approx_model = LSHForest(min_hash_match=0, n_candidates=n_points,
                             random_state=42).fit(X)

    # The query is aligned with the first axis.
    query = [[1., 0.]]

    # Exact cosine distances from the query to the four points.
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # Point 0 is almost aligned with the query: distance close to zero.
    assert_almost_equal(dists[0], 0, decimal=5)

    # Point 1 is at 45 degrees to the query.
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # Point 2 is orthogonal to the query: distance exactly one.
    assert_almost_equal(dists[2], 1)

    # Point 3 is almost colinear with opposite sign: distance close to the
    # maximum possible value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    eps = np.finfo(np.float64).eps
    boundary_cases = (
        # Radius one: everything but the last sample is included, the third
        # sample lying exactly on the boundary.
        (1, [0, 1, 2], dists[:-1]),
        # Slightly smaller radius: the boundary sample is now rejected.
        (1 - eps, [0, 1], dists[:-2]),
    )
    for radius, expected_idx, expected_dists in boundary_cases:
        exact_dists, exact_idx = exact_model.radius_neighbors(query, radius=radius)
        approx_dists, approx_idx = approx_model.radius_neighbors(query, radius=radius)

        assert_array_equal(np.sort(exact_idx[0]), expected_idx)
        assert_array_equal(np.sort(approx_idx[0]), expected_idx)
        assert_array_almost_equal(np.sort(exact_dists[0]), expected_dists)
        assert_array_almost_equal(np.sort(approx_dists[0]), expected_dists)
class LSH_KNN:
    """k-nearest-neighbour classifier on top of an ``LSHForest`` index.

    ``weights`` is either ``'uniform'`` (every neighbour counts once) or
    ``'distance'`` (a neighbour counts with the inverse of its Euclidean
    distance to the query). All remaining keyword arguments are forwarded to
    ``LSHForest``; ``n_neighbors`` is required. Labels are expected to be
    non-negative integers because they are counted with ``np.bincount``.
    """

    # Lower bound for distances used as divisors, so an exact match gets a
    # very large finite weight instead of a division by zero.
    _MIN_DISTANCE = 1e-12

    def __init__(self, weights='uniform', **kwargs):
        self.n_neighbors = kwargs['n_neighbors']
        self.lsh = LSHForest(**kwargs)
        self.weights = weights

    def fit(self, X, y):
        """Store the training data and labels and fit the forest on ``X``."""
        self.y = y
        self.X = X
        self.lsh.fit(X)

    def _neighbor_weights(self, query, neighbor_ids):
        """Return one voting weight per neighbour of ``query``."""
        if self.weights == 'uniform':
            return np.ones(len(neighbor_ids))
        if self.weights == 'distance':
            # Distance to the neighbouring training *point* (the label was
            # used here before, which is not a distance at all).
            return [1 / max(self.dist(query, self.X[j]), self._MIN_DISTANCE)
                    for j in neighbor_ids]
        raise ValueError("weights must be 'uniform' or 'distance', got %r"
                         % (self.weights,))

    def predict_top_n(self, test_X, n):
        """Return, per test point, the ``n`` labels with the most votes.

        The result has shape ``(len(test_X), n)``, most frequent label first.
        """
        _, indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        votes = np.zeros((len(test_X), n))
        for i, neighbor_ids in enumerate(indices):
            # minlength=n guarantees at least n candidates, so the row can
            # always be filled even when fewer than n labels were seen.
            counts = np.bincount([self.y[j] for j in neighbor_ids], minlength=n)
            votes[i] = counts.argsort()[-n:][::-1]
        return votes.astype(int)

    def predict_proba(self, test_X, return_dists=False):
        """Return smoothed class probabilities for every test point.

        With ``return_dists`` the result is ``(proba, dists)`` where
        ``dists`` holds, per test point, a dict mapping each neighbouring
        label to the list of distances of the neighbours carrying it.
        """
        # SMOOTHING PARAMETER TO PREVENT 0 PROBA;
        # https://stats.stackexchange.com/questions/83600/how-to-obtain-the-class-conditional-probability-when-using-knn-classifier
        s = 0.1
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        n_classes = int(np.amax(self.y)) + 1
        dists = []
        proba = np.zeros((len(test_X), n_classes))
        for test_point, neighbor_ids in enumerate(neighbor_indices):
            weights = self._neighbor_weights(test_X[test_point], neighbor_ids)
            weighted_class_counts = np.bincount(
                [self.y[j] for j in neighbor_ids],
                weights=weights, minlength=n_classes)
            proba[test_point] = np.true_divide(
                weighted_class_counts + s,
                np.sum(weighted_class_counts) + len(weighted_class_counts) * s)
            if return_dists:
                test_point_dists = {}
                for neighbor_index in neighbor_ids:
                    # Group the distances by label without touching self.y.
                    test_point_dists.setdefault(self.y[neighbor_index], []).append(
                        self.dist(test_X[test_point], self.X[neighbor_index]))
                dists.append(test_point_dists)
        if return_dists:
            return proba, dists
        return proba

    def predict(self, test_X):
        """Return the label with the largest (weighted) vote per test point."""
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        result = np.zeros(len(test_X))
        for test_point, neighbor_ids in enumerate(neighbor_indices):
            weights = self._neighbor_weights(test_X[test_point], neighbor_ids)
            weighted_class_counts = np.bincount(
                [self.y[j] for j in neighbor_ids], weights=weights)
            result[test_point] = np.argmax(weighted_class_counts)
        return result.astype(int)

    def dist(self, a, b):
        """Return the Euclidean distance between ``a`` and ``b``."""
        return np.linalg.norm(a - b)
def trainLSH(train, test, val):
    """Fit an LSH forest on the training features and evaluate it.

    The last column of ``train`` and ``val`` holds the label, all other
    columns are features. The fitted forest is handed to ``lshFunct``
    together with ``test``, ``val``, the column count and the training
    labels, and its result is returned.
    """
    n_feat = train[0].size
    label_column = n_feat - 1

    train_data, train_labels = train[:, :-1], train[:, label_column]
    # Split for symmetry with the training set; not used further below.
    val_data, val_labels = val[:, :-1], val[:, label_column]

    forest = LSHForest(random_state=42)
    forest.fit(train_data)

    return lshFunct(test, val, n_feat, forest, train_labels)
def __vectorize_corpus(self):
    """Turn every text into its topic-probability vector and index them.

    Each text is converted to a bag of words, the model's topic
    distribution is requested with ``minimum_probability=0.0``, and the
    second element of every returned entry is kept. The resulting vectors
    are stored in ``self.vectorized_docs`` and used to fit ``self.lsh``.
    """
    self.lsh = LSHForest(n_estimators=200, n_neighbors=self.num_topics)
    self.vectorized_docs = []
    for text in self.texts:
        topic_distribution = self.model.get_document_topics(
            self.dictionary.doc2bow(text), minimum_probability=0.0)
        self.vectorized_docs.append([entry[1] for entry in topic_distribution])
    self.lsh.fit(self.vectorized_docs)
def BuildModel(self, data, labels):
    """Create an LSH forest from this object's settings and fit it on ``data``.

    ``labels`` is accepted but not used by the forest.

    :return: the fitted ``LSHForest``.
    """
    forest_options = dict(
        n_estimators=self.n_estimators,
        min_hash_match=self.min_hash_match,
        n_candidates=self.n_candidates,
        radius_cutoff_ratio=self.radius_cutoff_ratio,
        radius=self.radius,
        n_neighbors=self.n_neighbors,
    )
    model = LSHForest(**forest_options)
    model.fit(data)
    return model
class LSHForestSearch:
    """Nearest-neighbour lookup through a single-tree, single-candidate forest."""

    def __init__(self, features, k):
        """Fit the forest on ``features``; ``k`` neighbours are returned per query."""
        forest = LSHForest(n_estimators=1, n_candidates=1, n_neighbors=k)
        self.lshf = forest
        self.k = k
        forest.fit(features)

    def search(self, features):
        """Return the indices (no distances) of the ``k`` neighbours of each query."""
        neighbor_ids = self.lshf.kneighbors(
            features, n_neighbors=self.k, return_distance=False)
        return neighbor_ids
def __init__(self, docs):
    """Vectorise ``docs`` by word presence and index them in an LSH forest.

    Every document (an iterable of words) becomes a ``{word: 1}`` mapping;
    the mappings are fitted and transformed with a ``DictVectorizer`` kept in
    ``self.dv`` and the resulting matrix is used to fit ``self.lshf``.
    """
    self.lshf = LSHForest(n_estimators=1, n_candidates=1, n_neighbors=1)
    self.dv = DictVectorizer()

    word_presence = [dict.fromkeys(doc, 1) for doc in docs]
    self.dv.fit(word_presence)

    # The vectoriser output is fed in as-is; per the original author's note,
    # floats are faster than casting the matrix to an integer csr_matrix.
    self.lshf.fit(self.dv.transform(word_presence))
class LHSForestEngine:
    """Engine wrapper reporting the distance to the single nearest neighbour."""

    def __init__(self):
        """Create an unfitted, seeded forest and set the engine's display name."""
        self.name = "LHS"
        self.engine = LSHForest(random_state=42)

    def fit(self, data):
        """Index ``data`` in the forest."""
        self.engine.fit(data)

    def dist(self, data):
        """Return, flattened to 1-D, the nearest-neighbour distance of each query."""
        nearest = self.engine.kneighbors(data, n_neighbors=1)
        return nearest[0].ravel()
def calculate_duplication_number(self,text_list): print "length is ", len(text_list) tf_vectorizer = CountVectorizer(stop_words=None,analyzer='word',ngram_range=(5,5)) #print text_list tf = tf_vectorizer.fit_transform(text_list) #print tf_vectorizer.get_feature_names() print tf[0] #print tf[123] lshf = LSHForest() #print tf lshf.fit(tf) distance,index = lshf.kneighbors(tf,n_neighbors=1) print distance, index
def startQuery(): while True: try: ipt = raw_input('Directory of query:') except ImportError: print 'invalid type' else: query = ipt if query == 'exit()': break print 'loading query...' try: token = get_tokens_by_dir(query) except IOError: print 'invalid file name' else: ##########################################query preprocessing print 'query pre-processing...' stopped_tokens = [i for i in token if not i in en_stop] p_stemmer = PorterStemmer() stemed_tokens = [] for i in stopped_tokens: try: temp_token = str(p_stemmer.stem(i)) stemed_tokens.append(temp_token) except IndexError: pass tokens = [stemed_tokens] ###################################################################################### dictionary_new = corpora.Dictionary(tokens) corpus_new = [dictionary_new.doc2bow(text) for text in tokens] QUERY_TOPIC = np.zeros([1,num_topic]) ## topic vector for query new_topics = LDA[corpus_new] for i in new_topics[0]: print(i) QUERY_TOPIC[0,i[0]] = i[1] ##assign new topics to query doc-topic matrix print 'fetching results for you...' lshf = LSHForest(random_state=42) lshf.fit(DOC_TOPICS) ##fit the local sensitive hash forest with training data POINT_SET dist,indices=lshf.kneighbors(QUERY_TOPIC,n_neighbors=20) print indices
def fit_lshf(data):
    """Fit an ``LSHForest`` with the project's fixed settings on ``data``.

    :return: the fitted forest.
    """
    logger.info('Fitting LSHForest...')
    # Imported here rather than at module level, as in the original code.
    from sklearn.neighbors import LSHForest

    forest_settings = {
        'n_estimators': 20,
        'min_hash_match': 4,
        'n_candidates': 200,
        'n_neighbors': 2,
        'radius': 1.0,
        'radius_cutoff_ratio': 0.9,
        'random_state': None,
    }
    forest = LSHForest(**forest_settings)
    forest.fit(data)
    return forest
def fit_model(self, model_type='brute', params=None): ''' fits model operating under the assumption that there's a model already built ''' if model_type == 'brute': self.model = NearestNeighbors(algorithm='brute', **params) elif model_type == 'lsh': self.model = LSHForest( **params) # elif model_type == 'annoy': # self.model = Annoy(**params) self.model.fit(self.vector_space) print self.model
def __init__(self):
    """Set up the recogniser state, its storage folders and an empty LSH index.

    Creates the picture folder and the feature folder when they do not exist,
    builds an unfitted ``LSHForest`` and initialises the label/feature lists,
    the result-code constants and the thresholds. The thresholds and
    ``nearest_num`` are read from names defined outside this method.
    """
    self.unknown = ''
    self.same_person_num = 1
    self.has_cal_dist = []
    self.NeighbourNum = 10
    # When an administrator loads pictures, they are placed in the given
    # person's directory under all_pic_data_folder (the picture file and its
    # feature file share the same file name).
    self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self'    # the research-institute model stores features directly
    # Keeping the pictures makes it easy to inspect results later, to show
    # them in the front end and for administrators to annotate them.
    self.all_pic_data_folder = '/data/liubo/face/research_self'
    if not os.path.exists(self.all_pic_data_folder):
        os.makedirs(self.all_pic_data_folder)
    if not os.path.exists(self.all_pic_feature_data_folder):
        os.makedirs(self.all_pic_feature_data_folder)
    self.n_neighbors = 10
    self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
    self.all_labels = []
    self.all_pic_feature = []
    # Result codes; trans_dic below maps each of them to a readable name.
    self.same_pic_id = 2
    self.must_be_same_id = 1
    self.must_be_not_same_id = 0
    self.maybe_same_id = 3
    self.new_person_str = 'new_person_'
    # Runs after the folders and new_person_str are set; presumably the
    # helper relies on them -- verify before reordering.
    self.current_new_person_id = self.find_current_new_person_id()
    self.must_same_str = '_Must_Same'
    self.maybe_same_str = '_Maybe_same'
    self.load_time = time.time()
    self.user_count = {}
    self.upper_threshold = upper_verif_threshold
    self.lower_threshold = lower_verif_threshold
    self.same_pic_threshold = same_pic_threshold
    self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id', self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
    # Bounded queue: holds at most nearest_num entries.
    self.nearest = deque(maxlen=nearest_num)
    self.verification_same_person = 0
def __init__(self):
    """Set up the recogniser state and an empty LSH index.

    Builds an unfitted ``LSHForest`` and initialises the label/feature lists,
    the result-code constants, the verification thresholds and the pose
    thresholds. The verification thresholds and ``nearest_num`` are read from
    names defined outside this method.
    """
    self.unknown = ''
    self.same_person_num = 1
    self.has_save_pic_feature = []
    self.has_cal_dist = []
    self.NeighbourNum = 10
    self.all_pic_data_folder = '/data/liubo/face/self'
    self.other_dataset_para_add = 1
    self.n_neighbors = 5
    self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
    self.all_labels = []
    self.all_pic_feature = []
    # Result codes; trans_dic below maps each of them to a readable name.
    self.same_pic_id = 2
    self.must_be_same_id = 1
    self.must_be_not_same_id = 0
    self.maybe_same_id = 3
    self.new_person_str = 'new_person_'
    # Runs after the folder and new_person_str are set; presumably the
    # helper relies on them -- verify before reordering.
    self.current_new_person_id = self.find_current_new_person_id()
    self.must_same_str = '_Must_Same'
    self.maybe_same_str = '_Maybe_same'
    self.load_time = time.time()
    self.user_count = {}
    # Thresholds differ from model to model.
    self.upper_threshold = upper_verif_threshold
    self.lower_threshold = lower_verif_threshold
    self.same_pic_threshold = same_pic_threshold
    self.pitch_threshold = 20
    self.yaw_threshold = 20
    self.roll_threshold = 20
    # [(time, feature), ..., (time, feature)]: used to compare the current
    # picture with the previous 5 pictures by time (skipped when the time
    # gap is large).
    self.nearest = deque(maxlen=nearest_num)
    self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id', self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
    self.verification_same_person = 0
def vectorized(self, num_topics=DefaultSetting.NUMBER_TOPICS):
    """Append a topic vector per corpus document and fit the LSH forest.

    Every document of ``self.doc_corpus`` is split on single spaces, turned
    into a bag of words and scored by ``self.model``; the second element of
    each topic pair is kept. The vectors are appended to
    ``self.vectorized_docs`` (the list is not reset here) and a new forest in
    ``self.lsh`` is fitted on the whole list.

    ``num_topics`` is accepted but not used.
    """
    self.lsh = LSHForest(n_estimators=DefaultSetting.HASH_SIZE, n_neighbors=10)

    # All bags of words are built before any topic inference happens.
    docs_bow = []
    for content in self.doc_corpus.documents:
        words = content.split(u' ')
        docs_bow.append(self.doc_corpus.dictionary.doc2bow(words))

    for bag in docs_bow:
        topic_pairs = self.model.get_document_topics(bag, minimum_probability=0.0)
        self.vectorized_docs.append([pair[1] for pair in topic_pairs])

    self.lsh.fit(self.vectorized_docs)
def cal_acc(pack_file, stat_file, feature_dim): f = open(stat_file, 'w') f.write('train_pic_num'+'\t'+'person_name'+'\t'+'acc'+'\n') pic_num = range(1, max_person_num) for num in pic_num: all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim) lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5) for index in range(len(all_train_data)): try: if all_train_data[index] == None: continue lshf.partial_fit(all_train_data[index], all_train_label[index]) except: traceback.print_exc() continue # 对于每个人,分别统计准确率 person_acc_dic = {} # 准确的个数 person_all_dic = {} # 总的个数 filter_num = 0 all_num = 0 for index in range(len(all_valid_data)): try: if all_valid_data[index] == None: continue all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True) cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]]) label = all_train_label[all_find_index[0, 0]] # if cos_sim > sim_threshold: if True: if label == all_valid_label[index]: person_acc_dic[label] = person_acc_dic.get(label, 0) + 1 person_all_dic[label] = person_all_dic.get(label, 0) + 1 else: person_all_dic[label] = person_all_dic.get(label, 0) + 1 else: filter_num += 1 all_num += 1 except: print all_valid_label[index] continue print 'train_num :', num, 'filter_rate: ', (filter_num * 1.0 / all_num) for person in person_all_dic: all_num = person_all_dic[person] right_num = person_acc_dic.get(person, 0) f.write('\t'.join(map(str, [num, person, (right_num * 1.0 / all_num)]))+'\n')
def test_partial_fit():
    """Check that ``partial_fit`` keeps the fitted attributes consistent.

    Inserting into an unfitted estimator must store the data as-is, a batch
    with the wrong number of features must be rejected, and after a valid
    insertion the stored data, the index arrays and the trees must all have
    grown by the size of the inserted batch.
    """
    n_samples, n_samples_partial_fit, n_features = 12, 3, 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Unfitted estimator: partial_fit behaves like a first fit.
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # A batch with one feature too few must raise.
    wrong_width = np.random.randn(n_samples_partial_fit, n_features - 1)
    assert_raises(ValueError, lshf.partial_fit, wrong_width)

    lshf.partial_fit(X_partial_fit)
    expected_size = n_samples + n_samples_partial_fit

    # Stored data, index array and tree all cover old plus new samples.
    assert_equal(lshf._fit_X.shape[0], expected_size)
    assert_equal(len(lshf.original_indices_[0]), expected_size)
    assert_equal(len(lshf.trees_[1]), expected_size)
def test_graphs():
    """Smoke-test the graph methods: both graphs must be n_samples x n_samples."""
    n_features = 3
    rng = np.random.RandomState(42)

    for n_samples in (5, 10, 20):
        X = rng.rand(n_samples, n_features)
        lshf = LSHForest(min_hash_match=0)
        ignore_warnings(lshf.fit)(X)

        kneighbors_graph = lshf.kneighbors_graph(X)
        radius_neighbors_graph = lshf.radius_neighbors_graph(X)

        for graph in (kneighbors_graph, radius_neighbors_graph):
            for axis in (0, 1):
                assert_equal(graph.shape[axis], n_samples)
def test_kneighbors():
    """Check that ``kneighbors`` returns the requested number of neighbors.

    With ``min_hash_match`` set to 0 the requested number of neighbors is
    guaranteed to be returned. Single queries, batches of queries, a point
    outside the data set and the estimator's default ``n_neighbors`` are all
    exercised; returned indices must have an integer dtype.
    """
    n_samples, n_features, n_iter = 12, 2, 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(min_hash_match=0)
    # Querying before fitting must fail.
    assert_raises(ValueError, lshf.kneighbors, X[0])

    lshf.fit(X)

    for _ in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        found = lshf.kneighbors(query, n_neighbors=n_neighbors,
                                return_distance=False)
        # Exactly the requested number of neighbors comes back.
        assert_equal(found.shape[1], n_neighbors)

    # Several query points at once.
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                           return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)

    # Indices only.
    neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)

    # A random point that is not part of the data set.
    query = rng.randn(n_features)
    lshf.kneighbors(query, n_neighbors=1, return_distance=False)

    # Without an explicit value the estimator's own n_neighbors applies.
    neighbors = lshf.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)

    # Returned indices are integers.
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")
def lshf_scikit(data, n_neighbors=4, n_estimators=10, min_hash_match=4, n_candidates=10, random_state=None):
    """Query every row of ``data`` for its neighbours with an LSH forest.

    The forest is fitted on ``data`` and then queried with the same ``data``.
    One more neighbour than requested is retrieved (presumably because each
    row is expected to come back as its own nearest neighbour -- confirm
    against callers).

    :param data: samples to index and query.
    :param n_neighbors: neighbours wanted per row; ``n_neighbors + 1`` are
        retrieved.
    :param n_estimators: forwarded to ``LSHForest``.
    :param min_hash_match: forwarded to ``LSHForest``.
    :param n_candidates: forwarded to ``LSHForest``.
    :param random_state: forwarded to ``LSHForest``; ``None`` (the default)
        keeps the seed 0 that used to be hard-coded, so default calls stay
        reproducible.
    :return: the ``(distances, indices)`` pair from ``kneighbors``.
    """
    # The tuning parameters used to be ignored in favour of hard-coded
    # values identical to the defaults; they are now honoured.
    if random_state is None:
        random_state = 0

    nbrs = LSHForest(n_neighbors=n_neighbors + 1,
                     n_estimators=n_estimators,
                     min_hash_match=min_hash_match,
                     n_candidates=n_candidates,
                     random_state=random_state)
    nbrs.fit(data)
    return nbrs.kneighbors(data)
def cal_recall(pack_file, stat_file, feature_dim): # f_model = open('verf.txt', 'w') f = open(stat_file, 'w') f.write('train_pic_num'+'\t'+'person_name'+'\t'+'recall'+'\n') pic_num = range(1, max_person_num) for num in pic_num: all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim) lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5) for index in range(len(all_train_data)): try: if all_train_data[index] == None: continue lshf.partial_fit(all_train_data[index], all_train_label[index]) except: continue # 对于每个人,分别统计准确率 person_find_dic = {} # 准确的个数 person_all_dic = {} # 总的个数 for index in range(len(all_valid_data)): try: if all_valid_data[index] == None: continue all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True) cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]]) label = all_train_label[all_find_index[0, 0]] real_label = all_valid_label[index] # if cos_sim > sim_threshold: if True: if label == real_label: # f_model.write('0'+'\t'+str(cos_sim)+'\n') person_find_dic[real_label] = person_find_dic.get(real_label, 0) + 1 person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1 else: # f_model.write('1' + '\t' + str(cos_sim) + '\n') person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1 except: print all_valid_label[index] continue print 'train_num :', num for person in person_all_dic: all_num = person_all_dic[person] right_num = person_find_dic.get(person, 0) f.write('\t'.join(map(str, [num, person, (right_num * 1.0 / all_num)]))+'\n')
def test_radius_neighbors():
    """Check that ``radius_neighbors`` only returns points within ``radius``.

    With the radius set to the mean cosine distance between the query and
    the data, at least one neighbor must be found and every returned
    distance must be below that radius. The approximate distances are also
    compared with those of an exact brute-force ``NearestNeighbors`` search.
    """
    n_samples, n_features, n_iter = 12, 2, 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Querying before fitting must fail.
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for _ in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))

        found = lshf.radius_neighbors(query, radius=mean_dist,
                                      return_distance=False)
        # At least one neighbor is returned.
        assert_greater(found.shape[0], 0)

        # Every returned distance lies below the radius.
        dists, found = lshf.radius_neighbors(query, radius=mean_dist,
                                             return_distance=True)
        assert_array_less(dists[0], mean_dist)

    # Several query points at once.
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Distances and indices must not be 2-D arrays.
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Comparison with an exact neighbor search.
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)

    # Sorted exact distances never exceed the sorted approximate ones.
    exact_sorted = np.sort(distances_exact[0])
    approx_sorted = np.sort(distances_approx[0])
    assert_true(np.all(np.less_equal(exact_sorted, approx_sorted)))