def test_dbscan_sklearn_comparison(name, nrows):
    """Compare cuDBSCAN labels against skDBSCAN on the ``name`` pattern.

    The dataset comes from ``get_pattern(name, nrows)``; its per-dataset
    settings override the defaults below. The cuDBSCAN fit always runs,
    while the skDBSCAN reference is only fitted and compared when ``nrows``
    is below 500000.
    """
    settings = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }
    pattern = get_pattern(name, nrows)
    # Entry 1 of the pattern carries the dataset-specific overrides.
    settings.update(pattern[1])
    eps = settings['eps']

    features, _ = pattern[0]
    scaled = StandardScaler().fit_transform(features)

    cu_labels, _ = fit_predict(
        cuDBSCAN(eps=eps, min_samples=5), 'cuml_DBSCAN', scaled)

    # No reference comparison is made for large inputs.
    if nrows >= 500000:
        return

    sk_labels, _ = fit_predict(
        skDBSCAN(eps=eps, min_samples=5), 'sk_DBSCAN', scaled)

    # An adjusted Rand score of exactly 1.0 means both labelings describe
    # the same partition, regardless of how the cluster ids are numbered.
    assert adjusted_rand_score(sk_labels, cu_labels) == 1.0
def test_rand_index_score(name, nrows):
    """Check that ``cu_ars`` and ``sk_ars`` agree on cuML KMeans output.

    A ``cuml.KMeans`` model is fitted on the standardized ``name`` pattern
    with ``nrows`` samples, and its predicted labels are scored against the
    dataset's ``y`` with both scoring functions (presumably the cuML and
    scikit-learn adjusted Rand score implementations -- confirm against the
    file's imports). The two scores must compare equal under
    ``array_equal``.
    """
    settings = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pattern = get_pattern(name, nrows)
    # Entry 1 of the pattern carries the dataset-specific overrides.
    settings.update(pattern[1])

    model = cuml.KMeans(n_clusters=settings['n_clusters'])

    features, truth = pattern[0]
    scaled = StandardScaler().fit_transform(features)

    predicted, _ = fit_predict(model, 'cuml_Kmeans', scaled)

    score_from_cu = cu_ars(truth, predicted)
    score_from_sk = sk_ars(truth, predicted)

    assert array_equal(score_from_cu, score_from_sk)
def test_kmeans_sklearn_comparison(name):
    """Compare cuML KMeans predictions with scikit-learn's on a pattern.

    Both estimators are fitted on the standardized ``name`` pattern with
    10000 samples, using the ``n_clusters`` value obtained after applying
    the pattern's own parameter overrides to the defaults below.

    For ``'noisy_circles'`` only the cluster sizes are compared; every
    other pattern must satisfy ``clusters_equal``.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }

    pat = get_pattern(name, 10000)

    params = default_base.copy()
    params.update(pat[1])
    n_clusters = params['n_clusters']

    kmeans = cluster.KMeans(n_clusters=n_clusters)
    cuml_kmeans = cuml.KMeans(n_clusters=n_clusters)

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)
    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    if name == 'noisy_circles':
        # Noisy circles clusters are rotated in the results; since we are
        # comparing 2 clusters we just need to check that both have
        # approximately the same number of points.
        # The difference is taken in absolute value: a signed difference
        # would let any negative gap, however large, pass the check.
        size_gap = abs(np.sum(sk_y_pred) - np.sum(cu_y_pred))
        assert size_gap / len(sk_y_pred) < 2e-3
    else:
        assert clusters_equal(sk_y_pred, cu_y_pred, n_clusters)
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans with scikit-learn's KMeans on a pattern dataset.

    ``cuml.KMeans`` is always fitted on the standardized ``name`` pattern
    with ``nrows`` samples. The scikit-learn reference is only fitted and
    compared when ``nrows`` is below 500000.

    When the comparison runs, the ``score(X)`` values of the two models are
    checked against a 2e-3 tolerance. In addition, ``'noisy_circles'`` must
    have matching cluster sizes, and every other pattern must satisfy
    ``clusters_equal``.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    params.update(pat[1])
    n_clusters = params['n_clusters']

    cuml_kmeans = cuml.KMeans(n_clusters=n_clusters)

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=n_clusters)
        sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)

        # Score each model once and reuse the values for both the debug
        # output and the check.
        cu_score = cuml_kmeans.score(X)
        sk_score = kmeans.score(X)
        print(cu_score, sk_score)

        # NOTE(review): this bound is one-sided -- it only limits how far
        # the cuML score may exceed the scikit-learn score. Confirm whether
        # a symmetric tolerance is intended before tightening it.
        assert (cu_score - sk_score) < 2e-3

        if name == 'noisy_circles':
            # Noisy circles clusters are rotated in the results; since we
            # are comparing 2 clusters we just need to check that both have
            # approximately the same number of points.
            # The difference is taken in absolute value: a signed
            # difference would let any negative gap pass the check.
            size_gap = abs(np.sum(sk_y_pred) - np.sum(cu_y_pred))
            assert size_gap / len(sk_y_pred) < 2e-3
        else:
            assert clusters_equal(sk_y_pred, cu_y_pred, n_clusters)
def test_dbscan_sklearn_comparison(name, use_handle):
    """Compare cuDBSCAN with skDBSCAN on the ``name`` pattern (1500 rows).

    Both estimators use the ``eps`` obtained after applying the pattern's
    own parameter overrides to the defaults below, with ``min_samples=5``.
    The cuDBSCAN estimator is built with the handle returned by
    ``get_handle(use_handle)``.

    The test requires both the number of clusters reported by
    ``fit_predict`` and the label assignment (via ``clusters_equal``) to
    agree.
    """
    # Skipping datasets of known discrepancies in PR83 while they are
    # corrected
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }

    pat = get_pattern(name, 1500)

    params = default_base.copy()
    params.update(pat[1])

    dbscan = skDBSCAN(eps=params['eps'], min_samples=5)

    # The stream half of the pair is not used by this test.
    handle, _ = get_handle(use_handle)
    cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    # Synchronize on the estimator's handle before inspecting the results.
    cuml_dbscan.handle.sync()

    assert sk_n_clusters == cu_n_clusters

    # The result of clusters_equal must be asserted; calling it without
    # checking the return value would never fail the test.
    assert clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)