import warnings
from timeit import default_timer as timer

import numpy as np
from scipy import spatial
from sklearn import neighbors as ng

from skmultiflow.data import FileStream, RandomTreeGenerator
# NOTE: the KDTree import path has moved between scikit-multiflow versions;
# adjust this import to match the installed version.
from skmultiflow.lazy import KDTree
from skmultiflow.transform import OneHotToCategorical


def test_kdd_tree_mixed():
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1, n_num_features=0)

    X, _ = stream.next_sample(1000)
    X_test, _ = stream.next_sample(10)

    # Build tree on 25 purely categorical features
    cat_features = list(range(25))
    kdtree = KDTree(X, metric='mixed', return_distance=True, categorical_list=cat_features)

    # Query tree
    dist, idx = kdtree.query(X_test, 4)

    expected_idx = [[123, 234, 707, 654], [688, 429, 216, 627], [463, 970, 566, 399],
                    [18, 895, 640, 996], [396, 612, 897, 232], [328, 54, 138, 569],
                    [253, 501, 82, 273], [38, 146, 752, 923], [946, 808, 271, 363],
                    [951, 111, 708, 5]]
    expected_dist = [[2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 0],
                     [2, 2, 2, 0], [2, 2, 2, 0], [2, 2, 2, 2], [2, 2, 0, 0],
                     [2, 2, 2, 0], [2, 2, 2, 2]]
    assert np.all(idx == expected_idx)
    assert np.allclose(dist, expected_dist)

    expected_info = 'KDTree(categorical_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ' \
                    '11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], ' \
                    'leaf_size=40, metric=mixed, return_distance=True)'
    assert kdtree.get_info() == expected_info
    assert kdtree._estimator_type == 'data_structure'
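
# The 'mixed' metric is not spelled out in the test above; the expected
# distances (only 0 or 2, over 25 purely categorical features) suggest it
# combines a numeric distance term with a 0/1 mismatch count on categorical
# features. The helper below is an illustrative sketch of that assumption,
# not scikit-multiflow's actual implementation.
def _mixed_distance_sketch(a, b, categorical_list):
    """Hypothetical mixed distance: squared difference on numeric features,
    0/1 mismatch count on categorical features."""
    total = 0.0
    for i, (ai, bi) in enumerate(zip(a, b)):
        if i in categorical_list:
            total += 0.0 if ai == bi else 1.0   # categorical: count mismatches
        else:
            total += (ai - bi) ** 2             # numeric: squared difference
    return total
# Under this reading, _mixed_distance_sketch(X[0], X[1], cat_features) == 2.0
# would mean the two samples disagree on exactly two categorical features.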

def test_kdd_tree_euclidean():
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1)

    X, _ = stream.next_sample(1000)
    X_test, _ = stream.next_sample(10)

    # Build tree
    kdtree = KDTree(X, metric='euclidean', return_distance=True)

    # Query tree
    dist, idx = kdtree.query(X_test, 4)

    expected_idx = [[855, 466, 348, 996], [829, 654, 92, 333], [227, 364, 183, 325],
                    [439, 482, 817, 501], [886, 173, 279, 470], [98, 30, 34, 580],
                    [959, 773, 374, 819], [819, 685, 59, 992], [624, 665, 209, 239],
                    [524, 807, 506, 191]]
    expected_dist = [[1.6366216258724973, 1.631437068636607, 1.5408182139320563, 1.4836054196064452],
                     [1.7839579422032452, 1.7694587302438618, 1.5339920309706585, 1.5228981881653287],
                     [1.6512443805072872, 1.637456923425164, 1.61736766513639, 1.5776532815820448],
                     [1.5843121606184263, 1.571918014408251, 1.5038147281265382, 0.7058569455034059],
                     [2.052148026638031, 2.0157953468214007, 1.8012794130725434, 1.6572756455115591],
                     [1.5844032729792423, 1.5688736638121885, 1.55893121879858, 1.4609657517960262],
                     [1.6819916227667229, 1.6186557774269037, 1.5815309744477162, 1.5720184136312232],
                     [1.7302164693989817, 1.5964713159009083, 1.4897849225874815, 1.1629448414734906],
                     [1.6511813695220574, 1.6454651930288255, 1.5926685577827064, 1.4973008307362947],
                     [1.5982346741983797, 1.5875900895982191, 1.4702209684850878, 1.4676217546305874]]
    assert np.all(idx == expected_idx)
    assert np.allclose(dist, expected_dist)

    expected_info = 'KDTree(categorical_list=None, leaf_size=40, metric=euclidean, return_distance=True)'
    assert kdtree.get_info() == expected_info
    assert kdtree._estimator_type == 'data_structure'
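
# An exhaustive nearest-neighbour pass is an easy way to sanity-check the
# hard-coded expectations above. This helper is an illustrative sketch and
# not part of the original test suite. Note that the expected distances in
# the test are listed farthest-first within each row, while this sketch
# returns neighbours nearest-first, so any comparison should be
# order-insensitive (or reverse one of the orderings).
def _brute_force_query_sketch(X, X_test, k):
    """Hypothetical helper: (dist, idx) of the k nearest neighbours by
    exhaustive Euclidean search, nearest-first."""
    X = np.asarray(X, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    # Pairwise Euclidean distances, shape (n_test, n_train)
    d = np.linalg.norm(X_test[:, None, :] - X[None, :, :], axis=-1)
    idx = np.argsort(d, axis=1)[:, :k]
    dist = np.take_along_axis(d, idx, axis=1)
    return dist, idx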

def demo():
    """ _test_kdtree_compare

    This demo compares construction and query speed for three kd-tree
    implementations, fed with instances from the covtype dataset:
    SciPy's KDTree, scikit-learn's KDTree and scikit-multiflow's KDTree.

    For each implementation the demo times the construction of the tree on
    1000 instances, then measures the time needed to query 100 instances.
    The results are displayed in the terminal.

    """
    warnings.filterwarnings("ignore", ".*Passing 1d.*")

    stream = FileStream('../data/datasets/covtype.csv', -1, 1)
    # Re-group the one-hot encoded columns back into two categorical features
    one_hot = OneHotToCategorical([[10, 11, 12, 13],
                                   [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                                    28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
                                    42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    X, _ = stream.next_sample(1000)
    X = one_hot.transform(X)
    X_find, _ = stream.next_sample(100)
    X_find = one_hot.transform(X_find)

    # SciPy kd tree
    start = timer()
    scipy_tree = spatial.KDTree(X, leafsize=40)
    end = timer()
    print("\nSciPy KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):    # query 100 instances, matching the other implementations
        scipy_tree.query(X_find[i], 8)
    end = timer()
    print("SciPy KDTree query time: " + str(end - start))
    del scipy_tree

    # scikit-multiflow kd tree
    start = timer()
    opt = KDTree(X, metric='euclidean', return_distance=True)
    end = timer()
    print("\nscikit-multiflow KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        dist, ind = opt.query(X_find[i], 8)
    end = timer()
    print("scikit-multiflow KDTree query time: " + str(end - start))
    del opt

    # scikit-learn kd tree
    start = timer()
    sk = ng.KDTree(X, metric='euclidean')
    end = timer()
    print("\nsklearn KDTree construction time: " + str(end - start))

    start = timer()
    for i in range(100):
        dist, ind = sk.query(np.asarray(X_find[i]).reshape(1, -1), 8, return_distance=True)
    end = timer()
    print("sklearn KDTree query time: " + str(end - start) + "\n")
    del sk
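
# A hypothetical entry point, assuming the demo is meant to be run directly
# as a script (consistent with its relative path to the covtype dataset):
if __name__ == '__main__':
    demo()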