def preprocess(prod, C, p, algo, nEst=10, nCand=40, feasibles=None):
    """Build a nearest-neighbour index over MIPS-to-NN transformed points.

    Parameters
    ----------
    prod : int
        Number of products; the special case indexes ``prod`` points.
    C : int
        Used as ``n_neighbors`` in the special case and as a factor of the
        normalisation constant in the general case.
    p : array-like
        Values whose entries ``p[1:]`` scale the point coordinates.
        Presumably ``len(p) == prod + 1`` -- confirm against the caller.
    algo : str
        One of ``'special_case_LSH'``, ``'general_case_LSH'``,
        ``'special_case_exact'``; any other value selects the exact general
        index.
    nEst, nCand : int
        ``n_estimators`` / ``n_candidates`` for the LSH variants.
    feasibles : sequence of array-like, optional
        Required in the general case: one vector of length ``prod`` per
        point to index. Ignored in the special case.

    Returns
    -------
    tuple
        ``(db, build_time, normConst)``: the fitted index, the wall-clock
        build time in seconds and the normalisation constant applied to the
        points.

    Raises
    ------
    ValueError
        If a general-case ``algo`` is requested without ``feasibles``.
    """
    t0 = time.time()

    if algo == 'special_case_LSH':
        print("\tLSH DB Special init...")
        db = LSHForest(n_estimators=nEst, n_candidates=nCand, n_neighbors=C)
    elif algo == 'general_case_LSH':
        print("\tLSH DB General init...")
        db = LSHForest(n_estimators=nEst, n_candidates=nCand, n_neighbors=1)
    elif algo == 'special_case_exact':
        print("\tExact DB Special init...")
        db = NearestNeighbors(n_neighbors=C, metric='cosine', algorithm='brute')
    else:
        print("\tExact DB General init...")
        db = NearestNeighbors(n_neighbors=1, metric='cosine', algorithm='brute')

    if algo in ('special_case_LSH', 'special_case_exact'):
        # One point per product: (p_i * e_i, e_i), scaled by normConst.
        U = np.eye(prod)
        normConst = np.sqrt(2 + np.max(p)**2)
        ptsTemp = np.concatenate(
            (U * np.array(p[1:]), U), axis=1) * 1.0 / normConst
    else:
        if feasibles is None:
            raise ValueError(
                "feasibles must be provided for the general case")
        normConst = C * np.sqrt(1 + np.max(p)**2)
        ptsTemp = np.zeros((len(feasibles), 2 * prod))
        for idx, feasible in enumerate(feasibles):
            ptsTemp[idx] = np.concatenate(
                (np.array(p[1:]) * feasible, feasible)) * 1.0 / normConst

    # MIPS to NN transformation of all points: append sqrt(1 - ||x||^2) as an
    # extra coordinate so that every indexed point has unit norm.
    lastCol = np.sqrt(1 - np.linalg.norm(ptsTemp, axis=1)**2)
    pts = np.concatenate((ptsTemp, lastCol.reshape((-1, 1))), axis=1)

    db.fit(pts)

    build_time = time.time() - t0
    print("\t\tIndex build time: ", build_time)

    return db, build_time, normConst
def Classify(nlp, keywords, categories):
    """Print the categories whose vectors are nearest to the keyword vector.

    The keyword vector is the sum of the ``nlp`` vectors of all keywords
    (duplicates counted), which is then queried against an LSHForest built
    from the category vectors.

    :param nlp: callable returning an object with a ``.vector`` attribute;
        must also expose ``nlp.vocab.vectors_length``.
    :param keywords: list of keyword strings.
    :param categories: dict mapping category name to its vector.
    :return: None; results are printed.
    """
    # Count duplicates once so every distinct keyword is vectorised only once.
    counterDict = Counter(keywords)
    sumVector = numpy.zeros(nlp.vocab.vectors_length)
    for word, repCount in counterDict.items():
        sumVector += nlp(word).vector * repCount

    # Diagnostic only: compare the summed vector with the vector of the
    # joined keyword text.
    text = ' '.join(keywords)
    sim = cosine_similarity(nlp(text).vector, sumVector)
    print("Sim: " + str(sim))

    catArray = numpy.array(list(categories.values()))
    catKeys = list(categories.keys())

    print("Creating LSHForest...")
    lshf = LSHForest(n_candidates=70, n_estimators=30,
                     n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")

    print("Getting neighbors...")
    distances, indices = lshf.kneighbors(sumVector.reshape((1, -1)))
    print("Got neighbors.")

    # Report each neighbour together with its own distance (previously the
    # whole distance array was printed for every category).
    for curIndex, curDistance in zip(indices.ravel(), distances.ravel()):
        print("Found category: " + str(catKeys[curIndex]))
        print("with distance: " + str(curDistance))
def score(factors):
    """Return the average number of correct predictions over three runs.

    Training and test vectors from ``Sets`` are passed through
    ``FactorizeVectors`` with ``factors``. In each run a fresh LSHForest is
    fitted and every test vector is classified from its 5 nearest neighbours
    via ``GetPrediction``.
    """
    runs = 3

    train_vectors, train_labels = Sets.trainingSet
    queries, expected = Sets.testSet

    train_vectors = FactorizeVectors(train_vectors, factors)
    queries = FactorizeVectors(queries, factors)

    hits_over_all_runs = 0
    for _ in range(runs):
        forest = LSHForest(n_estimators = 10, n_candidates = 10)
        forest.fit(train_vectors)

        hits = 0
        for position in range(len(queries)):
            actual = expected[position]
            distances, indices = forest.kneighbors(queries[position],
                                                   n_neighbors=5)
            predicted = GetPrediction(train_labels, distances[0], indices[0])
            if actual == predicted:
                hits += 1

        hits_over_all_runs += hits

    return float(hits_over_all_runs) / runs
def __init__(self, lsh_init=None):
    """Store the LSH forest to use and reset the ``iw`` / ``m`` state.

    :param lsh_init: forest instance to use; when ``None`` a default
        ``LSHForest(n_estimators=25, n_candidates=1000)`` is created.
    """
    # Identity test: "== None" would call a custom __eq__ on the supplied
    # object, which can give wrong or ambiguous results.
    if lsh_init is None:
        self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
    else:
        self._lsh_forest = lsh_init
    self.iw = None
    self.m = None
def fit_model(self, data, n_estimators, n_neighbours):
    """Return an LSHForest (``random_state=42``) fitted on ``data``.

    :param data: samples passed to ``LSHForest.fit``.
    :param n_estimators: forwarded as ``n_estimators``.
    :param n_neighbours: forwarded as ``n_neighbors``.
    """
    forest = LSHForest(
        n_neighbors=n_neighbours,
        n_estimators=n_estimators,
        random_state=42,
    )
    forest.fit(data)
    return forest
def persist_attraction_similarities_to_db():
    """Persist nearest-neighbour attraction similarities to the database.

    The module-level ``itemuserdf`` matrix is reduced to 10 dimensions with
    TruncatedSVD and indexed with an LSHForest. For every row, ``K``
    neighbours are queried and the first ``k`` of them (skipping the row
    itself) are stored as ``SimilarAttractions`` rows with similarity
    ``1 - distance``.
    """
    # Build the LSHForest model on the reduced-dimension dataset.
    svd = TruncatedSVD(n_components=10, n_iter=7)
    red_dim_itemuserdf = svd.fit_transform(itemuserdf)
    item_user_model = LSHForest()
    item_user_model.fit(red_dim_itemuserdf)

    K = 20  # query for K neighbors
    k = 10  # persist the first k of them

    def attraction_id_for(app_id):
        # Map a row index (stored as app_id) to the attraction's id.
        return Attraction.objects.filter(
            app_id=int(app_id)).values('attraction_id')[0]['attraction_id']

    for i in range(itemuserdf.shape[0]):
        distance, indices = item_user_model.kneighbors(
            red_dim_itemuserdf[i].reshape(1, -1), n_neighbors=K)
        weights = 1 - distance

        # Loop-invariant lookup: resolve the source attraction once per row
        # instead of once per neighbour.
        source_attraction_id = attraction_id_for(i)

        for j in range(k):
            neighbour = indices[0][j]
            if i == neighbour:
                continue
            e = SimilarAttractions(
                attraction_id=source_attraction_id,
                similar_attraction_id=attraction_id_for(neighbour),
                similarity=weights[0][j],
                ts=timezone.now(),
            )
            e.save()
def train():
    """Match every test keyword to its nearest SKU name and log the result.

    SKU names and keywords are tokenised with ``get_text_jieba``, vectorised
    with a TF-IDF model fitted on the SKU names, and matched through an
    LSHForest. Each match is printed and appended to
    ``result/lsh_v1_results.txt``.
    """
    # Build the matching corpus (398872 samples according to the original
    # note).
    sku_names_texts = get_train_datas()
    sku_names_with_spaces = [
        ' '.join(tokens) for tokens in get_text_jieba(sku_names_texts)
    ]

    # Test data (1000 samples according to the original note).
    keywords_texts = get_test_datas()
    keywords_with_spaces = [
        ' '.join(tokens) for tokens in get_text_jieba(keywords_texts)
    ]

    tfidf_vec = TfidfVectorizer(min_df=3,
                                max_features=None,
                                ngram_range=(1, 2),
                                use_idf=1,
                                smooth_idf=1,
                                sublinear_tf=1)
    x_train = tfidf_vec.fit_transform(sku_names_with_spaces)

    lshf = LSHForest(random_state=42)
    lshf.fit(x_train)

    # Open the result file once for the whole run rather than once per
    # keyword.
    with open("result/lsh_v1_results.txt", 'a', encoding='utf8') as wf:
        for i, kw in enumerate(keywords_with_spaces):
            x_test = tfidf_vec.transform([kw])
            distances, indices = lshf.kneighbors(x_test.toarray(),
                                                 n_neighbors=1)
            idx = indices[0][0]
            print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])
            wf.write(
                str(i) + "||" + keywords_texts[i] + "||" +
                sku_names_texts[idx] + "\n")
def fit_lsh(self):
    """Replace ``self.lsh`` with a forest fitted on the encoded background.

    Every entry of ``self.indexed_background`` is encoded with
    ``self.encode_sentence(entry, True)`` and the encodings are used to fit a
    new ``LSHForest(random_state=12345)``.
    """
    self.lsh = LSHForest(random_state=12345)

    encoded_background = []
    for position in range(len(self.indexed_background)):
        entry = self.indexed_background[position]
        encoded_background.append(self.encode_sentence(entry, True))

    self.lsh.fit(encoded_background)
def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):
    """Yield, for every start graph, its nearest graph from ``graphlist``.

    Both collections are vectorised (on deep copies) and the start graphs
    are matched against an LSHForest fitted on the ``graphlist`` vectors.
    When ``start_is_subset`` is true the second-nearest neighbour is used
    instead of the nearest one -- presumably because the nearest is the
    start graph itself.

    Yields tuples ``(start_graph, matched_graph, matched_vector)`` ordered by
    the index of the matched graph.
    """
    # Vectorize everything; copies are handed to the vectorizer so the
    # caller's graphs are left alone.
    graphlist = list(graphlist)
    X = self.vectorizer.transform_single(copy.deepcopy(graphlist))

    start_graphs = list(start_graphs)
    Y = self.vectorizer.transform_single(copy.deepcopy(start_graphs))

    # http://scikit-learn.org/stable/modules/neighbors.html
    forest = LSHForest()
    forest.fit(X)
    distances, indices = forest.kneighbors(Y, n_neighbors=2)

    # Neighbour column to read: 1 skips the closest hit, 0 keeps it.
    column = 1 if start_is_subset else 0

    # matches = (X_index, Y_index, distance)
    matches = sorted(
        (indices[row, column], row, distances[row, column])
        for row in range(len(indices))
    )

    for Xi, Yi, _ in matches:
        yield (start_graphs[Yi], graphlist[Xi], X[Xi])
def predict(login, file):
    """Score a login attempt against the stored speaker models.

    Loads the speaker GMM, the UBM and the stored model vector for ``login``,
    computes the GMM-minus-UBM likelihood score on the MFCC features of
    ``file`` and compares the flattened features with the stored model
    vector (the longer one is truncated to the shorter length).

    Returns a dict with keys ``'score'`` (``[likelihood_score, distances]``),
    ``'distance'`` and ``'Message'`` (``'Authenticated'`` when the likelihood
    score is positive, otherwise ``'Not Authenticated'``).
    """
    login_features = mfcc(login, file)
    lshf = LSHForest(random_state=42)

    # NOTE(review): ``login`` is interpolated into the paths and joblib.load
    # unpickles the files -- make sure ``login`` is validated upstream.
    model_dir = path + '/speaker_models/'
    gmm = joblib.load(model_dir + login + '.pkl')
    ubm = joblib.load(model_dir + 'ubm.pkl')
    model = joblib.load(model_dir + login + 'Model.pkl')

    likelihood_score = gmm.score(login_features) - ubm.score(login_features)

    # Flatten the per-frame features into one long vector.
    login_features = [value for frame in login_features for value in frame]

    # Truncate the longer vector so both have the same length; the truncated
    # one is indexed, the other one is the query.
    if len(model) > len(login_features):
        array, probe = model[:len(login_features)], login_features
    else:
        array, probe = login_features[:len(model)], model

    lshf.fit([array])
    distances, indices = lshf.kneighbors([probe], n_neighbors=2)
    dist = pairwise_distances_argmin_min([array], [probe])

    result = {
        'score': [likelihood_score, distances],
        'distance': dist,
    }
    result['Message'] = ('Authenticated' if likelihood_score > 0
                         else 'Not Authenticated')
    return result
def optimise(self, num_train_points, num_val_points, parameters):
    """Search the parameter grid for the most accurate LSHForest setup.

    For every candidate produced by ``self.get_generator(parameters)`` a
    forest is fitted on the first ``num_train_points`` training images and
    evaluated (5 neighbours) on the first ``num_val_points`` validation
    images with ``self.model_accuracy``.

    Returns ``(max_accuracy, optimal_estimators, optimal_n_neighbours)``;
    all three are ``-1`` when no candidate scores above ``-1``.
    """
    # (accuracy, n_est, n_neigh) of the best candidate seen so far.
    best = (-1, -1, -1)

    for candidate in self.get_generator(parameters):
        forest = LSHForest(random_state=42,
                           n_estimators=candidate['n_est'],
                           n_neighbors=candidate['n_neigh'])
        forest.fit(self.train.images[:num_train_points])
        _, indices = forest.kneighbors(
            self.validation.images[:num_val_points], n_neighbors=5)
        accuracy, _ = self.model_accuracy(indices, is_optimising=True)

        # Strict comparison: on ties the earliest candidate is kept.
        if accuracy > best[0]:
            best = (accuracy, candidate['n_est'], candidate['n_neigh'])

    return best
def single_batch(self, tweets):
    """Performs an approximate nearest neighbors search on tweets in the
    database passed to it.

    The database must be a list of tweets (text of the tweets only).

    Returns the indices of tweets with nearby neighbors (i.e. spam tweets):
    those for which the radius query (radius 0.4) returns more than two
    hits. These indices correspond to indices within the batch of tweets fed
    to this function.
    """
    # Vectorize the batch and fit the forest on it.
    vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
    X2 = vect2.fit_transform(tweets)
    tree2 = LSHForest()
    tree2.fit(X2)

    # Query every tweet against the forest. X2 already is the vectorised
    # batch (fit_transform is equivalent to fit followed by transform), so it
    # is reused instead of transforming the tweets a second time.
    neighbor_counts = []
    for x in X2:
        if len(neighbor_counts) % 100 == 0:
            print("%r tweets analyzed out of %r for this batch" % (
                len(neighbor_counts), self.batch_size))
        neighbors = tree2.radius_neighbors(x, radius=.4)[1]
        neighbor_counts.append(len(neighbors[0]))

    return [
        position for position, count in enumerate(neighbor_counts)
        if count > 2
    ]
def __init__(self, use_lsh_forest=False, n_neighbors=20, max_iterations=300,
             count_concepts=False, number_of_concepts=0, count_terms=False,
             training_validation_split=0.8, algorithm_id='7',
             l2r_metric="ERR@k", n_jobs=1, translation_probability=False,
             **kwargs):
    """Store the configuration and build the batched neighbour search.

    :param use_lsh_forest: when true an ``LSHForest`` is used for the
        neighbour search, otherwise ``NearestNeighbors``.
    :param n_neighbors: forwarded as ``n_neighbors`` to the search backend.
    :param kwargs: extra keyword arguments forwarded to the search backend.

    All remaining parameters are stored unchanged on the instance under
    their own names.
    """
    self.n_neighbors = n_neighbors

    if use_lsh_forest:
        backend = LSHForest(n_neighbors=n_neighbors, **kwargs)
    else:
        backend = NearestNeighbors(n_neighbors=n_neighbors, **kwargs)
    self.knn = BatchKNeighbors(backend)

    self.y = None
    self.max_iterations = max_iterations
    self.count_concepts = count_concepts
    self.count_terms = count_terms
    self.number_of_concepts = number_of_concepts
    self.training_validation_split = training_validation_split
    self.algorithm_id = algorithm_id
    self.l2r_metric = l2r_metric
    self.n_jobs = n_jobs
    self.translation_probability = translation_probability
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples, n_features, n_estimators = 12, 2, 5
    X = np.random.RandomState(42).rand(n_samples, n_features)

    forest = LSHForest(n_estimators=n_estimators)
    forest.fit(X)

    # The fitted data is kept as-is.
    assert_array_equal(X, forest._fit_X)

    # One hash function g(p) per tree, each with a hash length of 32.
    assert_equal(n_estimators, len(forest.hash_functions_))
    assert_equal(32, forest.hash_functions_[0].components_.shape[0])

    # One tree per estimator, with an entry for every data point.
    assert_equal(n_estimators, len(forest.trees_))
    assert_equal(n_samples, len(forest.trees_[0]))

    # Original indices (after sorting the hashes): one set per tree, each
    # with an entry for every data point.
    assert_equal(n_estimators, len(forest.original_indices_))
    assert_equal(n_samples, len(forest.original_indices_[0]))
def test_partial_fit():
    """Checks whether inserting array is consistent with fitted data.

    `partial_fit` method should set all attribute values correctly.
    """
    n_samples, n_samples_partial_fit, n_features = 12, 3, 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)
    expected_total = n_samples + n_samples_partial_fit

    forest = LSHForest()

    # partial_fit on an unfitted estimator stores the data.
    forest.partial_fit(X)
    assert_array_equal(X, forest._fit_X)

    forest.fit(X)

    # Inserting data with the wrong number of features must fail.
    wrong_width = np.random.randn(n_samples_partial_fit, n_features - 1)
    assert_raises(ValueError, forest.partial_fit, wrong_width)

    forest.partial_fit(X_partial_fit)

    # After insertion the stored data, the original indices and the trees
    # all cover the initial samples plus the inserted ones.
    assert_equal(forest._fit_X.shape[0], expected_total)
    assert_equal(len(forest.original_indices_[0]), expected_total)
    assert_equal(len(forest.trees_[1]), expected_total)
def test_neighbors_accuracy_with_n_estimators():
    """Checks whether accuracy increases as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples, n_features = 100, 10
    n_iter, n_points = 10, 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for slot, n_trees in enumerate(n_estimators):
        forest = LSHForest(n_candidates=500, n_estimators=n_trees)
        forest.fit(X)

        for _ in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            approx = forest.kneighbors(query, n_neighbors=n_points,
                                       return_distance=False)
            # Exact top-n_points by cosine distance, used as ground truth.
            exact_distances = pairwise_distances(query, X, metric='cosine')
            exact = np.argsort(exact_distances)[0, :n_points]

            overlap = np.intersect1d(exact, approx).shape[0]
            accuracies[slot] = accuracies[slot] + overlap / float(n_points)

        accuracies[slot] = accuracies[slot] / float(n_iter)

    # Accuracies must not decrease as the number of estimators grows.
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest.
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
def knn_indices_func_approx(rep_pts: FloatTensor,  # (N, pts, dim)
                            pts: FloatTensor,  # (N, x, dim)
                            K: int,
                            D: int) -> LongTensor:  # (N, pts, K)
    """
    Approximate CPU-based Indexing function based on K-Nearest Neighbors search.

    For every batch element an LSHForest is fitted on the point cloud and
    queried for ``D * K + 1`` neighbors of each representative point; the
    first hit is skipped and every ``D``-th of the remaining ones is kept.

    :param rep_pts: Representative points.
    :param pts: Point cloud to get indices from.
    :param K: Number of nearest neighbors to collect.
    :param D: "Spread" of neighboring points.
    :return: Array of indices, P_idx, into pts such that pts[n][P_idx[n],:]
        is the set k-nearest neighbors for the representative points in pts[n].
    :raises ValueError: if the batch is empty (nothing to stack).
    """
    import numpy as np
    import torch

    # LSHForest works on NumPy arrays, so move everything to host memory.
    if rep_pts.is_cuda:
        rep_pts = rep_pts.cpu()
    if pts.is_cuda:
        pts = pts.cpu()
    rep_pts = rep_pts.data.numpy()
    pts = pts.data.numpy()

    region_idx = []
    for n, p in enumerate(rep_pts):
        P_particular = pts[n]
        lshf = LSHForest(n_estimators=20, n_candidates=100,
                         n_neighbors=D * K + 1)
        lshf.fit(P_particular)
        indices = lshf.kneighbors(p, return_distance=False)
        # Drop column 0 (the closest hit) and dilate by D -> (pts, K).
        region_idx.append(indices[:, 1::D])

    # Previously the collected indices were never returned; stack them into
    # the (N, pts, K) index tensor promised by the signature.
    return torch.from_numpy(np.stack(region_idx, axis=0)).long()
def runForestLSHSizeAnalysis(argsdict, data, inlbl, fPath, fName, fileN, i):
    """Fit an LSHForest on ``data`` and report its size and build time.

    Only ``data`` is used; the remaining parameters are accepted but ignored
    by this function.

    Returns ``(size, seconds)`` where ``size`` is ``sys.getsizeof`` of the
    fitted forest and ``seconds`` is the wall-clock time spent constructing
    and fitting it.
    """
    t_begin = time.time()
    forest = LSHForest(random_state=42)
    forest.fit(data)
    elapsed = time.time() - t_begin

    # NOTE(review): sys.getsizeof is shallow -- it does not follow references
    # into the arrays held by the fitted forest. Confirm this is the intended
    # size metric.
    return sys.getsizeof(forest), elapsed
def __init__(self, params: Dict[str, Any]):
    """Pop the search options from ``params``, then initialise the model.

    Options removed from ``params`` before the superclass constructor sees
    it (defaults in parentheses):

    * ``corpus_path`` (``None``): location of the corpus used for the
      background-knowledge search; assumed to be gzipped, one sentence per
      line.
    * ``num_background`` (``10``): number of background sentences to collect
      for each input.
    * ``num_epochs_delay`` (``10``): epochs to wait before running the
      differentiable search, so the encoder can first be trained with the
      base memory network code on external background knowledge.
    * ``num_epochs_per_encoding`` (``2``): epochs to wait between
      re-encodings of the corpus.
    * ``load_saved_lsh`` (``False``): only meaningful when loading a model --
      whether to load a pickled LSH instead of re-initialising it from the
      input corpus. Loading an LSH built from a different corpus than
      ``corpus_path`` could lead to really weird behavior.
    """
    self.corpus_path = params.pop('corpus_path', None)
    self.num_background = params.pop('num_background', 10)
    self.num_epochs_delay = params.pop('num_epochs_delay', 10)

    # TODO(matt): consider only re-encoding at early stopping, instead of a
    # number-of-epoch-based parameter.
    self.num_epochs_per_encoding = params.pop('num_epochs_per_encoding', 2)

    self.load_saved_lsh = params.pop('load_saved_lsh', False)

    # Our own options are gone from ``params`` now, so it can be handed to
    # the superclass constructor.
    super(DifferentiableSearchMemoryNetwork, self).__init__(params)

    # Remaining member variables.
    self._sentence_encoder_model = self.__build_sentence_encoder_model()
    self.lsh = LSHForest(random_state=12345)
    self.instance_index = {}  # type: Dict[int, str]
def CreateAndconfigureLSHForest(categories):
    """Return an LSHForest fitted on the category vectors.

    :param categories: dict mapping category name to its vector; only the
        values are used, in the dict's iteration order.
    """
    print("Creating LSHForest...")
    category_vectors = numpy.array(list(categories.values()))
    forest = LSHForest(
        n_neighbors=TOP_N_COUNT,
        n_estimators=30,
        n_candidates=70,
    )
    forest.fit(category_vectors)
    print("LSHForest was created")
    return forest
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Score the rows of ``X`` for outlierness using LSHForest neighbours.

    :param X: samples; indexed with an ``LSHForest(random_state=42)``.
    :param k: number of neighbours requested per sample (the first returned
        neighbour is discarded, so ``k - 1`` are used).
    :param outlier_threshold: samples scoring above this are reported.
    :param verbose: print the threshold and the detected outliers.
    :return: list of ``[index, sample, score]`` for every detected outlier.
    """
    forest = LSHForest(random_state=42)
    forest.fit(X)
    distance, index = forest.kneighbors(X, n_neighbors=k)

    # Drop the first neighbour of every sample -- presumably the sample
    # itself; LSHForest is approximate, so confirm.
    distance = distance[:, 1:]
    index = index[:, 1:]
    # Distance to the farthest retained neighbour.
    radius = distance[:, -1]

    # Calculate LRD.
    LRD = np.mean(np.maximum(distance, radius[index]), axis=1)
    r = 1. / np.array(LRD)

    # Calculate outlier score.
    # NOTE(review): the divisor is cast to float16 and the score is divided
    # by k although only k - 1 neighbours are kept -- confirm both are
    # intended.
    outlier_score = np.sum(r[index], axis=1) / np.array(r, dtype=np.float16)
    outlier_score *= 1. / k

    if verbose:
        print("Recording all outliers with outlier score greater than %s." \
              % (outlier_threshold))

    outliers = [
        [position, X[position], value]
        for position, value in enumerate(outlier_score)
        if value > outlier_threshold
    ]

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector) should be
    different from those of the flattened array of all hash functions. If
    the hash functions were not randomly built (seeded with the same value),
    the variances and means of all functions would be equal.
    """
    n_samples, n_features, n_estimators = 12, 2, 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    seed = rng.randint(0, np.iinfo(np.int32).max)
    forest = LSHForest(n_estimators=n_estimators, random_state=seed)
    forest.fit(X)

    hash_functions = [
        forest.hash_functions_[i].components_ for i in range(n_estimators)
    ]

    for components in hash_functions:
        assert_not_equal(np.var(hash_functions), np.var(components))

    for components in hash_functions:
        assert_not_equal(np.mean(hash_functions), np.mean(components))
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples, n_features, n_iter = 12, 2, 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    forest = LSHForest()
    forest.fit(X)

    def is_non_decreasing(values):
        return np.all(np.diff(values) >= 0)

    for _ in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]

        # k-neighbors results should be ordered from closest to farthest.
        distances, neighbors = forest.kneighbors(query,
                                                 n_neighbors=n_neighbors,
                                                 return_distance=True)
        assert_true(is_non_decreasing(distances[0]))

        # Same ordering requirement for a radius query at the mean distance.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = forest.radius_neighbors(query,
                                                       radius=mean_dist,
                                                       return_distance=True)
        assert_true(is_non_decreasing(distances[0]))
def hash_movie_similarity(um, num_neighbors=6):
    """Return similarities and indices of each column's nearest columns.

    The columns of ``um`` (rows of ``um.T``) are indexed with an LSHForest
    and queried against themselves for ``num_neighbors + 1`` neighbours; the
    first neighbour column is dropped so that an item is not compared to
    itself.

    :return: ``(sim, ind)`` where ``sim`` is ``1 - distance``; both have
        ``num_neighbors`` columns.
    """
    items = um.T
    forest = LSHForest(random_state=470957)
    forest.fit(items)

    dist, ind = forest.kneighbors(items,
                                  n_neighbors=num_neighbors + 1,
                                  return_distance=True)
    similarity = 1 - dist
    return similarity[:, 1:], ind[:, 1:]
def _fit(self, xs):
    """
    Fit index

    :param xs: samples to index, passed to ``LSHForest.fit``
    :return: None; the fitted forest is stored in ``self.index``
    """
    # ``n_estimators`` falls back to 20 when not configured.
    n_trees = self.parameters.get('n_estimators', 20)
    self.index = LSHForest(n_estimators=n_trees)
    self.index.fit(xs)
def test_radius_neighbors_boundary_handling():
    """Checks that points exactly on the query radius are handled alike by
    the exact and the approximate radius search."""
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Exact reference model, to ensure consistency between the exact and the
    # approximate methods.
    exact_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # LSHForest with hyperparameter values that always guarantee exact
    # results on this toy dataset.
    approx_model = LSHForest(min_hash_match=0, n_candidates=n_points,
                             random_state=42).fit(X)

    # Query aligned with the first axis.
    query = [[1., 0.]]

    # Exact cosine distances of the query to the four points of the dataset.
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # Point 0 is almost aligned with the query: distance almost null.
    assert_almost_equal(dists[0], 0, decimal=5)
    # Point 1 forms an angle of 45 degrees with the query.
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))
    # Point 2 is orthogonal to the query: distance exactly one.
    assert_almost_equal(dists[2], 1)
    # Point 3 is almost colinear with opposite sign: cosine 'distance' very
    # close to the maximum possible value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    eps = np.finfo(np.float64).eps
    scenarios = (
        # Radius one: everything but the last sample is included, the third
        # sample lying exactly on the boundary.
        (1, [0, 1, 2], dists[:-1]),
        # Slightly smaller radius: the boundary point is now rejected.
        (1 - eps, [0, 1], dists[:-2]),
    )
    for radius, expected_idx, expected_dists in scenarios:
        exact_dists, exact_idx = exact_model.radius_neighbors(query,
                                                              radius=radius)
        approx_dists, approx_idx = approx_model.radius_neighbors(
            query, radius=radius)

        assert_array_equal(np.sort(exact_idx[0]), expected_idx)
        assert_array_equal(np.sort(approx_idx[0]), expected_idx)
        assert_array_almost_equal(np.sort(exact_dists[0]), expected_dists)
        assert_array_almost_equal(np.sort(approx_dists[0]), expected_dists)
def test_radius_neighbors():
    """Checks whether returned distances are less than `radius`.

    At least one point should be returned when the `radius` is set to the
    mean distance from the considered point to the other points in the
    database. Moreover, this test compares the radius neighbors of LSHForest
    with those of `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples, n_features, n_iter = 12, 2, 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    forest = LSHForest()
    # Querying an unfitted estimator must fail.
    assert_raises(ValueError, forest.radius_neighbors, X[0])

    forest.fit(X)

    for _ in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))

        # At least one neighbor should be returned.
        neighbors = forest.radius_neighbors(query, radius=mean_dist,
                                            return_distance=False)
        assert_greater(neighbors.shape[0], 0)

        # All distances should be less than mean_dist.
        distances, neighbors = forest.radius_neighbors(query,
                                                       radius=mean_dist,
                                                       return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple query points at once.
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = forest.radius_neighbors(queries,
                                                   return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays.
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with the exact neighbor search.
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    exact_model = NearestNeighbors(algorithm='brute', metric='cosine')
    exact_model.fit(X)

    distances_approx, _ = forest.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = exact_model.radius_neighbors(query, radius=mean_dist)

    # Distances of exact neighbors are less than or equal to approximate.
    exact_sorted = np.sort(distances_exact[0])
    approx_sorted = np.sort(distances_approx[0])
    assert_true(np.all(np.less_equal(exact_sorted, approx_sorted)))
def test_candidates():
    """Checks whether candidates are sufficient.

    This should handle the cases when the number of candidates is 0. The
    user should be warned when the number of candidates is less than the
    requested number of neighbors.
    """
    X_train = np.array(
        [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]],
        dtype=np.float32)
    X_test = np.array([7, 10, 3], dtype=np.float32)

    scenarios = (
        (32, 3),  # zero candidates
        (31, 5),  # fewer candidates than n_neighbors
    )
    for min_hash_match, n_neighbors in scenarios:
        forest = LSHForest(min_hash_match=min_hash_match)
        forest.fit(X_train)

        message = ("Number of candidates is not sufficient to retrieve"
                   " %i neighbors with"
                   " min_hash_match = %i. Candidates are filled up"
                   " uniformly from unselected"
                   " indices." % (n_neighbors, min_hash_match))
        assert_warns_message(UserWarning, message, forest.kneighbors,
                             X_test, n_neighbors=n_neighbors)

        # The requested number of neighbors is still returned.
        distances, neighbors = forest.kneighbors(X_test,
                                                 n_neighbors=n_neighbors)
        assert_equal(distances.shape[1], n_neighbors)
def BuildModel(self, data, labels):
    """Create an LSHForest from this object's settings and fit it on ``data``.

    ``labels`` is accepted but not used by this method.

    :return: the fitted LSHForest.
    """
    settings = dict(
        n_estimators=self.n_estimators,
        min_hash_match=self.min_hash_match,
        n_candidates=self.n_candidates,
        radius_cutoff_ratio=self.radius_cutoff_ratio,
        radius=self.radius,
        n_neighbors=self.n_neighbors,
    )
    forest = LSHForest(**settings)
    forest.fit(data)
    return forest
def test_real_model(self):
    """
    Test that model name works for sklearn estimators
    """
    # Instantiate every estimator first, then compare the reported names.
    cases = [
        (LassoCV(), 'LassoCV'),
        (LSHForest(), 'LSHForest'),
        (KMeans(), 'KMeans'),
    ]
    for estimator, expected_name in cases:
        self.assertEqual(get_model_name(estimator), expected_name)