def test_kd_tree_KDE(n_samples=100, n_features=3):
    """Compare KDTree.kernel_density against a slow reference implementation.

    Yields ``(check, kernel, h, atol, rtol, dualtree, breadth_first)``
    tuples in the nose generator-test style.

    Fix: the inner check function used to read ``dens_true`` from the
    enclosing scope via late binding; it is now captured as a default
    argument, so every yielded test sees the reference density computed
    for its own (kernel, h) pair even if the checks execute after the
    loops have advanced.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    kdt = KDTree(X, leaf_size=10)
    kernels = ['gaussian', 'tophat', 'epanechnikov',
               'exponential', 'linear', 'cosine']
    for kernel in kernels:
        for h in [0.001, 0.01, 0.1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            def check_results(kernel, h, atol, rtol, dualtree,
                              breadth_first, dens_true=dens_true):
                dens = kdt.kernel_density(Y, h, atol=atol, rtol=rtol,
                                          kernel=kernel,
                                          dualtree=dualtree,
                                          breadth_first=breadth_first)
                assert_allclose(dens, dens_true, atol=atol, rtol=rtol)

            for rtol in [0, 1E-5]:
                for atol in [1E-10, 1E-5, 0.1]:
                    for dualtree in (True, False):
                        # dual-tree traversal does not support rtol > 0
                        if dualtree and rtol > 0:
                            continue
                        for breadth_first in (True, False):
                            yield (check_results, kernel, h, atol, rtol,
                                   dualtree, breadth_first)
def fit(self, points):
    """Main clustering entry point (OPTICS-style cluster ordering).

    Clustering is driven by ``_expand_cluste_order``: given a starting
    point pt, compute pt's core_distance and its eps-neighborhood and
    update the reachability_distance of every point in that neighborhood;
    then process pt's unprocessed neighbors in increasing
    reachability_distance order (same procedure as above).  Running this
    expansion for every not-yet-expanded point of the data set completes
    the clustering.  Per-point results are stored in ``self.results`` and
    the visiting order in ``self.results_order``; together they describe
    the full clustering.

    :param points: [list] input data; each element is a fixed-length 1-D
        numpy array
    :returns: self
    :rtype:
    """
    # results holds [processed flag, core distance, reachability distance]
    # results_order records the order in which points were visited
    """
    results[遍历标记,核心距离,可达距离]
    results_order 存放数据遍历顺序
    """
    self.point_num = len(points)
    self.point_size = points[0].size
    # [processed?, core_distance, reachability_distance] per point;
    # distances start at +inf (i.e. "undefined")
    self.results = [[None, np.inf, np.inf] for x in range(self.point_num)]
    self.results_order = []
    ## data is kept in a kd-tree for lookups (apparently never queried here)
    self.kd_tree = KDTree(self.point_size)
    self.kd_tree.create(points)
    for point_id in range(self.point_num):
        ## expand the current point only if it has not been processed yet
        if not self.results[point_id][0]:
            self._expand_cluste_order(point_id)
    return self
def test_kd_buildup(points: List[Point]) -> float:
    """Return the peak extra memory (bytes) allocated while building a KDTree."""
    tracemalloc.start()
    baseline, _ = tracemalloc.get_traced_memory()
    tree = KDTree(points)  # keep the tree alive until the peak is sampled
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return peak - baseline
def fit(self, X, y):
    """Store the training data in a KDTree for later neighbor queries.

    :param X: training feature vectors (rows are sparse unless
        ``self.dense`` is set, in which case they are used as-is)
    :param y: labels aligned with the rows of X
    """
    self.label_set = set(y)
    data = list()
    # Fix: ``it.izip`` exists only in Python 2; the builtin ``zip``
    # produces the same (row, label) pairs on both Python 2 and 3.
    for con, lab in zip(X, y):
        if not self.dense:
            # densify a single sparse row into a flat 1-D array
            con = con.toarray()
            con = con[0]
        data.append((con, lab))
    # Create a KDTree using the data given and store it
    self.data = KDTree(data, self.k_neighbours, balanced=self.balanced)
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    """Verify KDTree.query distances against a brute-force computation."""
    tree = KDTree(X, leaf_size=1, metric=metric, **kwargs)
    dist_tree, _ = tree.query(Y, k, dualtree=dualtree,
                              breadth_first=breadth_first)
    dist_brute, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)
    # Indices are deliberately not compared: duplicate distances may be
    # tie-broken differently by the two methods, but the distances
    # themselves must agree.
    assert_allclose(dist_tree, dist_brute)
def petrol_bunk():
    """Load petrol-bunk coordinates and build a KDTree over them.

    Each line of ``petrol_bunk.txt`` holds comma-separated floats; every
    resulting entry is ``[coords, row_index]``.

    :returns: (KDTree built from the points, the raw points list)
    """
    points = []
    # Fix: the file handle was never closed; ``with`` guarantees closure
    # even if parsing raises.  ``enumerate`` replaces the manual counter.
    with open('petrol_bunk.txt', 'r') as infile:
        for index, line in enumerate(infile):
            coords = list(map(float, line.rstrip().split(",")))
            points.append([coords, index])
    return KDTree(points), points
def test_kd_search(points: List[Point], rectangles: List[Rectangle]) -> List[float]:
    """Time a KDTree range search for each rectangle; return seconds per query."""
    tree = KDTree(points)

    def time_individual(rectangle: Rectangle) -> float:
        min_x, max_x, min_y, max_y = rectangle.to_tuple()
        began = default_timer()
        tree.search(min_x, max_x, min_y, max_y)
        return default_timer() - began

    return [time_individual(rect) for rect in rectangles]
def hospitals():
    """Load hospital coordinates and build a KDTree over them.

    Each line of ``hospitals.txt`` holds comma-separated floats; every
    resulting entry is ``[coords, row_index]``.

    :returns: (KDTree built from the points, the raw points list)
    """
    points = []
    # Fix: the file handle was never closed; ``with`` guarantees closure
    # even if parsing raises.  ``enumerate`` replaces the manual counter.
    with open('hospitals.txt', 'r') as infile:
        for index, line in enumerate(infile):
            coords = list(map(float, line.rstrip().split(",")))
            points.append([coords, index])
    return KDTree(points), points
def test_kd_tree_pickle():
    """A pickled-and-restored KDTree must answer queries identically."""
    import pickle
    np.random.seed(0)
    X = np.random.random((10, 3))
    original = KDTree(X, leaf_size=1)
    ind1, dist1 = original.query(X)

    def check_pickle_protocol(protocol):
        payload = pickle.dumps(original, protocol=protocol)
        restored = pickle.loads(payload)
        ind2, dist2 = restored.query(X)
        assert_allclose(ind1, ind2)
        assert_allclose(dist1, dist2)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
def test_kd_tree_two_point(n_samples=100, n_features=3):
    """Check two_point_correlation counts against a direct pairwise count."""
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    radii = np.linspace(0, 1, 10)
    kdt = KDTree(X, leaf_size=10)
    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(D <= radius).sum() for radius in radii]

    def check_two_point(r, dualtree):
        counts = kdt.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_allclose(counts, counts_true)

    for dualtree in (True, False):
        yield check_two_point, radii, dualtree
def test_random(self):
    """Plot a dense patch of a 2-D normal cloud and highlight the 50 nearest points."""
    count, sigma1, sigma2 = 10000, 0.6, 0.5
    np.random.seed(0)
    x = np.random.normal(3, sigma1, count)
    y = np.random.normal(3, sigma2, count)
    point = [3.01, 3.01]
    # Only draw samples falling inside a small window around (3, 3),
    # one scatter call per point, as in the original.
    for xi, yi in zip(x, y):
        if 2.98 < xi < 3.03 and 2.98 < yi < 3.03:
            ax.scatter(xi, yi, c='b', marker='s', s=10, alpha=0.7)
    tree = KDTree(np.c_[x, y])
    show_closest(tree, point, 50, 'm')
    plt.show()
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """query_radius must return exactly the indices within each radius."""
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)
    eps = 1E-15  # roundoff error can cause the test to fail without slack
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))
    for r in np.linspace(rad[0], rad[-1], 100):
        found = kdt.query_radius(query_pt, r + eps)[0]
        expected = np.where(rad <= r + eps)[0]
        # order is unspecified, so compare the sorted index sets
        assert_allclose(np.sort(expected), np.sort(found))
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Distances returned by query_radius must match direct computation."""
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)
    eps = 1E-15  # roundoff error can cause the test to fail without slack
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))
    for r in np.linspace(rad[0], rad[-1], 100):
        indices, distances = kdt.query_radius(query_pt, r + eps,
                                              return_distance=True)
        idx, dist = indices[0], distances[0]
        direct = np.sqrt(((query_pt - X[idx]) ** 2).sum(1))
        assert_allclose(direct, dist)
def main():
    """Benchmark KDTree nearest-neighbour search against exhaustive search.

    Runs ``test_times`` rounds of: generate random data, build a kd-tree,
    find the nearest neighbour of a random query both via the tree and by
    linear scan, assert both give the same distance, and accumulate the
    wall-clock time of each method.
    """
    print("Testing KD Tree...")
    test_times = 100
    run_time_1 = run_time_2 = 0
    for _ in range(test_times):
        # generate random data
        low = 0
        high = 100
        n_rows = 1000
        n_cols = 2
        X = gen_data(low, high, n_rows, n_cols)
        y = gen_data(low, high, n_rows)
        Xi = gen_data(low, high, n_cols)
        # build the kd-tree
        tree = KDTree()
        tree.build_tree(X, y)
        # kd-tree search
        start = time()
        nd = tree.nearest_neighbour_search(Xi)
        run_time_1 += time() - start
        ret1 = get_eu_dist(Xi, nd.split[0])
        # plain linear (exhaustive) search
        start = time()
        row = exhausted_search(X, Xi)
        run_time_2 += time() - start
        ret2 = get_eu_dist(Xi, row)
        # compare the two results; distances must match exactly since both
        # are computed by get_eu_dist on the found row
        assert ret1 == ret2, "target:%s\nrestult1:%s\nrestult2:%s\ntree:\n%s" % (
            Xi, nd, row, tree)
    print("%d tests passed!" % test_times)
    print("KD Tree Search %.2f s" % run_time_1)
    print("Exhausted search %.2f s" % run_time_2)
def test_randoms(self):
    """Scatter three 2-D Gaussian clusters and mark the points nearest a probe."""
    n, s1, s2 = 500, 0.6, 0.5
    np.random.seed(0)
    # NOTE: the RNG is seeded, so the draw order below must stay exactly
    # as in the original for identical output.
    xs_a = np.random.normal(5, s1, n)
    ys_a = np.random.normal(5, s2, n)
    xs_b = np.random.normal(3, s1, n)
    ys_b = np.random.normal(4, s2, n)
    xs_c = np.random.normal(4.5, s1, n)
    ys_c = np.random.normal(2.5, s2, n)
    probe = [np.random.normal(5, 0.6), np.random.normal(5, 0.5)]
    ax.scatter(xs_a, ys_a, c='b', marker='s', s=10, alpha=0.7)
    ax.scatter(xs_b, ys_b, c='r', marker='^', s=10, alpha=0.7)
    ax.scatter(xs_c, ys_c, c='g', s=10, alpha=0.7)
    coords = np.c_[np.r_[xs_a, xs_b, xs_c], np.r_[ys_a, ys_b, ys_c]]
    tree = KDTree(coords)
    show_closest(tree, probe, 'm')
    plt.show()
def test_should_find_n_closest_7():
    """The three nearest 2-D points to (7, 1) are indices 5, 4 and 1."""
    tree = KDTree(dataset_2d)
    _, indices = tree.query(np.array([7, 1]), n=3)
    assert indices == [5, 4, 1]
def test_should_find_closest_6():
    """The single nearest 3-D point to (8, 8, 8) is index 4."""
    tree = KDTree(dataset_3d)
    _, indices = tree.query(np.array([8, 8, 8]), n=1)
    assert indices == [4]
def test_should_find_closest_5():
    """The single nearest 3-D point to (4, 5, 6) is index 2."""
    tree = KDTree(dataset_3d)
    _, indices = tree.query(np.array([4, 5, 6]), n=1)
    assert indices == [2]
def test_should_find_closest_4():
    """The single nearest 3-D point to (1, 2, 3) is index 3."""
    tree = KDTree(dataset_3d)
    _, indices = tree.query(np.array([1, 2, 3]), n=1)
    assert indices == [3]
def build_tree(pixels):
    """Insert every pixel into a 3-dimensional KDTree and return the tree."""
    tree = KDTree(3)  # 3 dimensions — presumably one per colour channel; confirm
    for pixel in pixels:
        tree.insert(pixel)
    return tree
def fit(self, training_set, labels):
    """Fit the configured neighbor-search backend to the training data.

    Uses a kd-tree when ``self.algorithm == 'kd_tree'`` and brute-force
    search otherwise; the chosen backend is kept in ``self.handler``.
    """
    use_tree = self.algorithm == 'kd_tree'
    backend = KDTree if use_tree else BruteSearch
    self.handler = backend(training_set, labels)
def test_should_find_closest_1():
    """The single nearest 2-D point to (9, 2) is index 4."""
    tree = KDTree(dataset_2d)
    _, indices = tree.query(np.array([9, 2]), n=1)
    assert indices == [4]
def test_should_find_n_closest_8():
    """The three nearest 2-D points to (3, 2) are indices 0, 1 and 5."""
    tree = KDTree(dataset_2d)
    _, indices = tree.query(np.array([3, 2]), n=3)
    assert indices == [0, 1, 5]
def setUp(self):
    """Build the fixture tree from six hand-picked 2-D points."""
    fixture = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]
    self.tree = KDTree(fixture)
# [3,4,6,1], # [3,6,6,1], # [3,5,9,2], # [3,5,12,2], # [3,5,13,2],] X = [[2, 3, 1], [5, 4, 1], [9, 6, 1], [8.5, 6, 1], [4, 7, 1], [8, 1, 1], [7, 2, 1]] X = np.array(X) # data_train = pd.read_csv('./data_set/iris_1.csv', header=0) # train_data = np.array(data_train) # X = train_data[:, :-1] # y = train_data[:, -1] # X_train, X_test, y_train, y_true = train_test_split(X, y,test_size=1 / 3., random_state=6) # # train_set = np.column_stack((X_train, y_train)) kd = KDTree() kd.build_tree(X) x = [[7, 6, 1], [3, 4.5, 1]] test_x = np.array(x) # print(test_x[:,:-1]) nearest = kd.search_neighbour(test_x) for i in range(len(test_x)): print(test_x[i], '--->', nearest[i])
def test_kd_buildup(points: List[Point]) -> float:
    """Return the wall-clock seconds taken to construct a KDTree.

    NOTE(review): this shadows the memory-measuring ``test_kd_buildup``
    defined earlier if both live in one module — confirm they come from
    separate files.
    """
    began = default_timer()
    KDTree(points)
    return default_timer() - began
def test_should_find_n_closest_9():
    """The three nearest 3-D points to (4, 5, 6) should be indices 2, 3, 1."""
    tree = KDTree(dataset_3d)
    _, indices = tree.query(np.array([4, 5, 6]), n=3)
    assert indices == [2, 3, 1]  # NOTE: original comment says this test fails!
def main():
    """Demo: build a KDTree over six 2-D points and query one nearest neighbor."""
    sample = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]])
    tree = KDTree(sample)
    dist, ind = tree.query(np.array([9, 2]), n=1)  # result currently unused
def setUp(self):
    """Build a 1-D fixture tree from ten deterministic random ints in [0, 50)."""
    np.random.seed(0)
    values = np.random.randint(0, 50, 10)
    # transpose turns the flat vector into a (10, 1) column of points
    self.tree = KDTree(np.transpose([values]))
def __create_kdtree(self):
    """Rebuild the kd-tree from every representative point of every cluster.

    Each representative is inserted with its owning cluster as payload.
    """
    self.__KDTree_T = KDTree()
    for cluster in self.__heap_q:
        for representative in cluster.rep:
            self.__KDTree_T.insert(representative, cluster)