def test_ball_tree_pickle():
    """Check that BallTree objects survive a pickle round trip.

    Two trees are built on the same random data: one with the default
    metric and one with the Python callable ``dist_func`` as metric.  For
    pickle protocols 0, 1 and 2 both trees are dumped, reloaded and
    queried; the reloaded trees must reproduce the original query output
    and the reloaded default tree must still be a ``BallTree``.
    """
    random_state = check_random_state(0)
    X = random_state.random_sample((10, 3))

    tree = BallTree(X, leaf_size=1)
    # A tree whose metric is a Python callable has to be picklable as well.
    tree_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    # Reference results computed before any serialisation happens.
    dist_ref, ind_ref = tree.query(X)
    dist_ref_pyfunc, ind_ref_pyfunc = tree_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        restored = pickle.loads(pickle.dumps(tree, protocol=protocol))
        restored_pyfunc = pickle.loads(
            pickle.dumps(tree_pyfunc, protocol=protocol)
        )

        dist_new, ind_new = restored.query(X)
        dist_new_pyfunc, ind_new_pyfunc = restored_pyfunc.query(X)

        assert_array_almost_equal(dist_ref, dist_new)
        assert_array_almost_equal(ind_ref, ind_new)

        assert_array_almost_equal(dist_ref_pyfunc, dist_new_pyfunc)
        assert_array_almost_equal(ind_ref_pyfunc, ind_new_pyfunc)
        assert isinstance(restored, BallTree)

    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def test_ball_tree_pickle():
    """Check that BallTree objects survive a pickle round trip.

    A tree with the default metric and a tree using the Python callable
    ``dist_func`` as metric are dumped and reloaded with pickle protocols
    0, 1 and 2; every reloaded tree must give the same query output as the
    tree it was created from.
    """
    # A private generator yields the same numbers as np.random.seed(0)
    # followed by np.random.random(...), but leaves the global RNG state
    # (shared with every other test) untouched.
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    # query() returns (distances, indices), in that order.
    dist1, ind1 = bt1.query(X)
    dist1_pyfunc, ind1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)

        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        dist2, ind2 = bt2.query(X)
        dist2_pyfunc, ind2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)

        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)
        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)

    # Call the check directly: generator ("yield") tests are not executed
    # by current pytest releases, so the assertions would never run.
    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Score every row of X with a local-outlier-factor style measure.

    A BallTree is built on ``X`` and queried with ``X`` itself for ``k``
    neighbours; the first neighbour column is discarded (presumably the
    point itself at distance zero), leaving ``k - 1`` neighbours per row.

    Parameters
    ----------
    X : indexable array of points, one per row
        Passed to ``BallTree`` and indexed with ``X[i]`` for the output.
    k : int
        Number of neighbours requested from the tree; must be at least 2.
    outlier_threshold : float, optional
        Rows whose score is strictly greater than this are reported.
    verbose : bool, optional
        When true, print the threshold and the detected outliers.

    Returns
    -------
    list
        One ``[row_index, X[row_index], score]`` entry per reported row.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2, since no neighbour would remain after
        the first column is dropped.
    """
    if k < 2:
        raise ValueError("k must be at least 2, got %r" % (k,))

    tree = BallTree(X, leaf_size=2)
    distance, index = tree.query(X, k)
    distance, index = distance[:, 1:], index[:, 1:]
    # Distance from each point to its farthest retained neighbour.
    radius = distance[:, -1]

    # Mean reachability distance of each point; its reciprocal is used as
    # the local density.
    reach = np.mean(np.maximum(distance, radius[index]), axis=1)
    density = 1. / reach

    # Ratio of the neighbours' densities to the point's own density.  The
    # divisor is kept in full precision: narrowing it to float16 rounds
    # away most digits and overflows to inf for densities above ~65504.
    outlier_score = np.sum(density[index], axis=1) / density
    # NOTE(review): the sum runs over k - 1 neighbours but is divided by k;
    # confirm whether the normalisation should be k - 1.
    outlier_score *= 1. / k

    if verbose:
        print("Recording all outliers with outlier score greater than %s."
              % (outlier_threshold))

    outliers = [[i, X[i], score]
                for i, score in enumerate(outlier_score)
                if score > outlier_threshold]

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
def similar_products2(deep_f):
    """Return the products whose features are closest to ``deep_f``.

    Every ``Product`` is loaded, its ``get_features()`` vector is indexed
    in a BallTree, and the tree is queried with ``deep_f`` for up to five
    neighbours.  Only the neighbours of the first query row are used.

    Parameters
    ----------
    deep_f : array-like
        Query passed unchanged to ``BallTree.query`` — assumed to be a
        2-D array holding one feature vector (TODO confirm with callers).

    Returns
    -------
    QuerySet
        ``Product`` rows whose ``asin`` belongs to the nearest neighbours.
        The queryset is filtered by membership, so it is not ordered by
        similarity.  Empty when there are no products.
    """
    products = list(Product.objects.all())
    if not products:
        # An empty feature matrix cannot be indexed by a BallTree.
        return Product.objects.none()

    features = np.asarray([prod.get_features() for prod in products])
    asins = [prod.asin for prod in products]

    tree = BallTree(features)
    # The tree rejects k larger than the number of indexed points.
    n_neighbors = min(5, len(products))
    _, ind = tree.query(deep_f, k=n_neighbors)
    # Single-argument call form works as both a Python 2 statement and a
    # Python 3 function call.
    print(ind)

    recommended_asins = [asins[i] for i in ind[0]]
    return Product.objects.filter(asin__in=recommended_asins)
def similar_products(product):
    """Return up to four products most similar to ``product``.

    Every ``Product`` is loaded, its ``get_features()`` vector is indexed
    in a BallTree, and the tree is queried with the feature vector of the
    row whose ``asin`` equals ``product.asin``.

    Parameters
    ----------
    product : object with an ``asin`` attribute
        The product to find neighbours for.

    Returns
    -------
    QuerySet
        ``Product`` rows whose ``asin`` belongs to the nearest neighbours,
        excluding ``product`` itself.  The queryset is filtered by
        membership, so it is not ordered by similarity.  Empty when
        ``product`` is not among the stored products.
    """
    products = list(Product.objects.all())
    asins = [prod.asin for prod in products]

    matches = [i for i, asin in enumerate(asins) if asin == product.asin]
    if not matches:
        # Without a match there is no query vector; falling back to the
        # first stored product would recommend unrelated items.
        return Product.objects.none()
    # Last match wins, as in a loop that overwrites the index on each hit.
    product_index = matches[-1]

    features = np.asarray([prod.get_features() for prod in products])
    tree = BallTree(features)

    # The tree rejects k larger than the number of indexed points.
    n_neighbors = min(5, len(products))
    # Slice instead of index so the query stays 2-D (one row), which is
    # the shape BallTree.query expects.
    _, ind = tree.query(features[product_index:product_index + 1],
                        k=n_neighbors)
    # Single-argument call form works as both a Python 2 statement and a
    # Python 3 function call.
    print(ind)

    # Drop the query product by identity rather than by position: with
    # duplicated feature vectors it is not guaranteed to come first.
    neighbours = [i for i in ind[0] if i != product_index][:4]
    recommended_asins = [asins[i] for i in neighbours]
    return Product.objects.filter(asin__in=recommended_asins)
class ClusterModel:
    """Nearest-neighbour lookup of corpus questions by sentence vector.

    The corpus is read from a CSV file and its vectors are loaded with
    ``load_qa_corpus_vec``.  Vectors for which ``any(v)`` is false are
    left out of the BallTree; ``_indices`` maps each tree row back to its
    position in the corpus.
    """

    def __init__(self, sen2vec, corpus_path, corpus_vec_path):
        self.sen2vec = sen2vec
        self._corpus = pd.read_csv(corpus_path)
        self._vectors = load_qa_corpus_vec(corpus_vec_path)

        # Keep only the vectors with at least one truthy component,
        # remembering where each of them sits in the corpus.
        kept = [(pos, vec) for pos, vec in enumerate(self._vectors) if any(vec)]
        self._indices = [pos for pos, _ in kept]

        # Build the ball tree
        self.tree = BallTree(np.array([vec for _, vec in kept]))

    def __call__(self, sentence, k=1):
        """
        Find the top-k questions most similar to the given sentence.

        Returns a list of ``(question, answer, distance)`` tuples in the
        order the tree reports the neighbours.
        """
        query_vec = self.sen2vec.sentence2vec([sentence]).reshape(1, -1)
        dist, ind = self.tree.query(query_vec, k=k)

        corpus_rows = [self._indices[tree_row] for tree_row in ind[0]]
        matches = []
        for rank, corpus_row in enumerate(corpus_rows):
            entry = self._corpus.loc[corpus_row]
            matches.append((entry['question'], entry['answer'], dist[0][rank]))
        return matches
def src_nearest_gst_distance(src_pos, gst_pos, nn=1):
    """Find the ``nn`` nearest ``gst_pos`` entries for every ``src_pos`` entry.

    Both position arrays are converted from degrees to radians and
    compared with the haversine metric.  The resulting distances are
    passed through ``haversine_to_km`` and then multiplied by
    ``FIBER_PATH_STRETCH``, so the returned distances INCLUDE PATH STRETCH.

    Returns ``(indices, distances)`` — note the order is the reverse of
    what ``BallTree.query`` returns.
    """
    haversine = DistanceMetric.get_metric("haversine")
    tree = BallTree(np.deg2rad(gst_pos), metric=haversine)

    raw_dist, nearest_ind = tree.query(np.deg2rad(src_pos), k=nn)
    stretched_dist = haversine_to_km(raw_dist) * FIBER_PATH_STRETCH
    return nearest_ind, stretched_dist
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    """Assert that BallTree k-NN distances match the brute-force ones.

    ``X`` and ``Y`` come from the enclosing scope.  ``kwargs`` holds extra
    metric parameters and is forwarded both to the tree and to
    ``brute_force_neighbors``.
    """
    tree = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    tree_dist, _ = tree.query(
        Y, k, dualtree=dualtree, breadth_first=breadth_first
    )
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # Only distances are compared: with tied distances the neighbour
    # indices may legitimately differ, the distances may not.
    assert_array_almost_equal(tree_dist, brute_dist)
def test_query_haversine():
    """Compare haversine 5-NN results of BallTree with brute force.

    The 40 random two-column points are used both to build the tree and
    as query points; distances and indices must agree.
    """
    random_state = check_random_state(0)
    points = 2 * np.pi * random_state.random_sample((40, 2))

    tree = BallTree(points, leaf_size=1, metric='haversine')
    tree_dist, tree_ind = tree.query(points, k=5)
    brute_dist, brute_ind = brute_force_neighbors(
        points, points, k=5, metric='haversine'
    )

    assert_array_almost_equal(tree_dist, brute_dist)
    assert_array_almost_equal(tree_ind, brute_ind)
def test_query_haversine():
    """Compare haversine 5-NN results of BallTree with brute force.

    The 40 random two-column points are used both to build the tree and
    as query points; distances and indices must agree.
    """
    # A private generator yields the same numbers as np.random.seed(0)
    # followed by np.random.random(...), but leaves the global RNG state
    # (shared with every other test) untouched.
    rng = np.random.RandomState(0)
    X = 2 * np.pi * rng.random_sample((40, 2))

    bt = BallTree(X, leaf_size=1, metric='haversine')
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric='haversine')

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
def test_ball_tree_query_metrics(metric):
    """Check BallTree 5-NN distances against brute force for ``metric``.

    Data is drawn to suit the metric family: values rounded to 0/1 for
    metrics listed in ``BOOLEAN_METRICS`` and values rounded to 0..4 for
    metrics listed in ``DISCRETE_METRICS``.

    Raises
    ------
    ValueError
        If ``metric`` belongs to neither family, because no suitable test
        data is defined for it.
    """
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)
    else:
        # Fail with a clear message instead of an UnboundLocalError on X
        # further down.
        raise ValueError(
            "metric %r is neither in BOOLEAN_METRICS nor in DISCRETE_METRICS"
            % (metric,)
        )

    k = 5

    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    # Indices are not compared: rounded data produces many tied distances.
    assert_array_almost_equal(dist1, dist2)
def test_ball_tree_query(metric, k, dualtree, breadth_first):
    """Check BallTree k-NN distances against brute force.

    The tree is built on 40 random points of dimension ``DIMENSION`` and
    queried with 10 others; the extra metric parameters are looked up in
    ``METRICS[metric]`` and forwarded to both implementations.
    """
    random_state = check_random_state(0)
    train = random_state.random_sample((40, DIMENSION))
    queries = random_state.random_sample((10, DIMENSION))

    metric_params = METRICS[metric]

    tree = BallTree(train, leaf_size=1, metric=metric, **metric_params)
    tree_dist, _ = tree.query(
        queries, k, dualtree=dualtree, breadth_first=breadth_first
    )
    brute_dist, _ = brute_force_neighbors(
        train, queries, k, metric, **metric_params
    )

    # Only distances are compared: with tied distances the neighbour
    # indices may legitimately differ, the distances may not.
    assert_array_almost_equal(tree_dist, brute_dist)
def test_ball_tree_pickle():
    """Check that a BallTree survives a pickle round trip.

    The tree is dumped and reloaded with pickle protocols 0, 1 and 2; each
    reloaded tree must give the same query output as the original.
    """
    import pickle

    # A private generator yields the same numbers as np.random.seed(0)
    # followed by np.random.random(...), but leaves the global RNG state
    # (shared with every other test) untouched.
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # query() returns (distances, indices), in that order.
    dist1, ind1 = bt1.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)

    # Call the check directly: generator ("yield") tests are not executed
    # by current pytest releases, so the assertions would never run.
    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def test_ball_tree_pickle():
    """Check that a BallTree survives a pickle round trip.

    The tree is dumped and reloaded with pickle protocols 0, 1 and 2; each
    reloaded tree must give the same query output as the original.
    """
    import pickle

    # A private generator yields the same numbers as np.random.seed(0)
    # followed by np.random.random(...), but leaves the global RNG state
    # (shared with every other test) untouched.
    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # query() returns (distances, indices), in that order.
    dist1, ind1 = bt1.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert_allclose(dist1, dist2)
        assert_allclose(ind1, ind2)

    # Call the check directly: generator ("yield") tests are not executed
    # by current pytest releases, so the assertions would never run.
    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def query(self, X: np.ndarray, k: Optional[int] = None) -> np.ndarray:
    """
    Look up the k nearest nodes for every sample.

    A BallTree with the euclidean metric is built on ``self.nodes`` for
    each call and queried with ``X``.

    Parameters:
        X: An array of shape (num_samples, num_features).
        k: The number of neighbors to return; ``self._k`` is used when
            it is None.

    Returns:
        An array of shape (num_samples, k) and of type int containing the
        indices of the k nearest nodes.
    """
    n_neighbors = self._k if k is None else k
    tree = BallTree(self.nodes, metric="euclidean")
    # Distances are computed by the tree but not part of this API.
    _, neighbor_indices = tree.query(X, n_neighbors)
    return neighbor_indices
def rank(self, cs, yc, ls, lss):
    """Rank all rows of ``yc`` for each query in ``cs`` and mark the hits.

    A BallTree is built on ``yc`` and every row of ``cs`` is queried for
    all ``len(yc)`` neighbours, giving a full ranking of the tree rows per
    query.

    Parameters
    ----------
    cs : array-like
        Query points, one row per query.
    yc : array-like
        Points indexed by the tree.
    ls : sequence of hashable labels
        Label ``ls[i]`` is mapped to the integer ``i``; presumably ``i``
        is the row of ``yc`` the label belongs to (TODO confirm).
    lss : iterable of iterables of labels
        For each query, the labels that count as hits.  Every label must
        occur in ``ls``.

    Returns
    -------
    list of numpy.ndarray
        One float array per query, in ranking order: 1.0 where the ranked
        row index is one of the query's hit indices, 0.0 elsewhere.

    Raises
    ------
    KeyError
        If a label in ``lss`` does not occur in ``ls``.
    """
    targets = {label: i for (i, label) in enumerate(ls)}
    # Number of results (lemmas) ranked
    n_results = len(yc)
    # Build ball tree model
    ball_tree = BallTree(yc)
    rs = ball_tree.query(cs, k=n_results, return_distance=False)

    rankings = []
    for ranking, hit_labels in zip(rs, lss):
        # A set gives constant-time membership tests; distinct names keep
        # the ``ls`` parameter from being shadowed by the loop.
        hit_ids = {targets[label] for label in hit_labels}
        rankings.append(
            np.array([1.0 if idx in hit_ids else 0.0 for idx in ranking])
        )
    return rankings
def _nonlocalmeans_clustered(img, n_small=5, n_components=9, n_neighbors=30, h=10):
    """Denoise a 2-D image by averaging pixels with similar neighbourhoods.

    For every pixel the surrounding square patch is flattened, all patches
    are projected with PCA, and each pixel is replaced by a weighted mean
    of the pixels whose projected patches are its nearest neighbours in a
    BallTree.

    Parameters
    ----------
    img : 2-D numpy array
        Input image; its shape and dtype are kept for the output.
    n_small : int, optional
        Patch half-width; patches are ``(2 * n_small + 1)`` pixels wide.
    n_components : int, optional
        Number of PCA components kept per patch.
    n_neighbors : int, optional
        Neighbours requested per pixel.  The first one is discarded
        (presumably the pixel itself), so ``n_neighbors - 1`` are averaged.
    h : number, optional
        Filter strength; weights are ``exp(-distance / h**2)``.

    Returns
    -------
    numpy array
        The denoised image, same shape and dtype as ``img``.
    """
    side = 2 * n_small + 1
    h2 = h * h
    n_rows, n_cols = img.shape

    # Row/column offsets of every patch pixel relative to the patch centre.
    small_rows, small_cols = np.indices((side, side)) - n_small

    # Reflect-pad so that border pixels get full-size patches too.
    n_padded = np.pad(img, n_small, mode='reflect')
    patches = np.zeros((n_rows * n_cols, side ** 2))
    centres = (
        (r, c)
        for r in range(n_small, n_small + n_rows)
        for c in range(n_small, n_small + n_cols)
    )
    # Row-major order: patch number r * n_cols + c belongs to pixel (r, c).
    for n, (r, c) in enumerate(centres):
        patches[n, :] = n_padded[r + small_rows, c + small_cols].flatten()

    transformed = PCA(n_components=n_components).fit_transform(patches)
    # index the patches into a tree
    tree = BallTree(transformed, leaf_size=2)

    print("Denoising")
    # One batched 2-D query for all pixels: BallTree.query expects an
    # (n_samples, n_features) array, and a single call avoids one Python
    # level tree query per pixel.
    dist, ind = tree.query(transformed, k=n_neighbors)
    dist, ind = dist[:, 1:], ind[:, 1:]

    # Patch numbers are row-major pixel numbers, so the flattened image
    # can be indexed with them directly.
    colors = img.ravel()[ind]
    w = np.exp(-dist / h2)

    new_img = np.zeros_like(img)
    # Assigning into the preallocated array keeps the dtype of ``img``.
    new_img[...] = (np.sum(w * colors, axis=1) / np.sum(w, axis=1)).reshape(
        n_rows, n_cols
    )
    return new_img
def check_neighbors(metric):
    """Assert that BallTree k-NN distances match the brute-force ones.

    ``X``, ``Y`` and ``k`` come from the enclosing scope; only distances
    are compared.
    """
    tree = BallTree(X, leaf_size=1, metric=metric)
    tree_dist, _ = tree.query(Y, k)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(tree_dist, brute_dist)
# Benchmark: time k-NN searches on a faiss GPU flat L2 index and on a
# BallTree for 2**3 .. 2**9 query rows and 2**3 .. 2**9 neighbours.
# ``n_points`` and ``d`` must be defined before this section runs.
batch_size = 512
X = np.random.random(size=(n_points, d)).astype(np.float32)

# GPU index on device 0 holding all points.
res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = 0
index = faiss.GpuIndexFlatL2(res, d, flat_config)
index.add(X)

# Each output line: <number of queries> <k> <elapsed milliseconds>.
# Single-argument print calls behave the same on Python 2 and Python 3.
for bi in range(3, 10):
    for ki in range(3, 10):
        t = time.time()
        D, I = index.search(X[0:2**bi, :], 2**ki)
        print("%d %d %d" % (2**bi, 2**ki, int((time.time() - t) * 1000)))

t = time.time()
cpu_index = BallTree(X)
print("BallTree build time (mins) %d" % int((time.time() - t) / 60))

#t = time.time()
#D, I = cpu_index.query(X[0:batch_size,:], k)
#print int((time.time()-t)*1000)

# Same sweep on the CPU tree; query() returns (distances, indices).
for bi in range(3, 10):
    for ki in range(3, 10):
        t = time.time()
        D, I = cpu_index.query(X[0:2**bi, :], 2**ki)
        print("%d %d %d" % (2**bi, 2**ki, int((time.time() - t) * 1000)))
df = df.append(pd.Series(i), ignore_index=True) #print(df.shape) df = df[(df.T != 0).all()] #print(df.shape) print('Training model KNN .........') from sklearn.neighbors.ball_tree import BallTree tree = BallTree(df, leaf_size=2) #print(df.shape) # 7839 100 index = np.expand_dims(df.iloc[69,:], axis =0) dist, ind = tree.query(index, k=3) # doctest: +SKIP print(ind) # indices of 3 closest neighbors #[0 3 1] print(dist) # distances to 3 closest neighbors # #[ 0. 0.19662693 0.29473397] v1 = df.iloc[ind[:,0],:] v2 = df.iloc[ind[:,1],:] v3 = df.iloc[ind[:,2],:] V1 = np.array(v1) V2 = np.array(v2) V3 = np.array(v3) for k,v in WandV.items():
#print(len(WandV.values())) #print(WandV.values()) import pandas as pd df = pd.DataFrame() for i in WandV.values(): #print(pd.DataFrame(i)) df = df.append(pd.Series(i), ignore_index=True) #print("temp head",df.head()) #print("temp shape", df.shape) from sklearn.neighbors.ball_tree import BallTree print("KNN ...........") tree = BallTree(df, leaf_size=2) print("finding neighbor words .....") dist, ind = tree.query(df[:1], k=3) # doctest: +SKIP print(ind) # indices of 3 closest neighbors #[0 3 1] print(dist) # distances to 3 closest neighbors #[ 0. 0.19662693 0.29473397] v1 = df.iloc[0, :] v2 = df.iloc[363, :] v3 = df.iloc[3774, :] V1 = np.array(v1) V2 = np.array(v2) V3 = np.array(v3) for k, v in WandV.items(): comparison = v == V1