def test_dbscan_balltree(): # Tests the DBSCAN algorithm with balltree for neighbor calculation. eps = 0.8 min_samples = 10 D = pairwise_distances(X) core_samples, labels = dbscan(D, metric="precomputed", eps=eps, min_samples=min_samples) # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="ball_tree") labels = db.fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters) db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm="kd_tree") labels = db.fit(X).labels_ n_clusters_3 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_3, n_clusters) db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm="ball_tree") labels = db.fit(X).labels_ n_clusters_4 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_4, n_clusters) db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm="ball_tree") labels = db.fit(X).labels_ n_clusters_5 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_5, n_clusters)
def runDBSCAN(distance_matrix, my_eps, my_min_samples, number_of_threads): db = DBSCAN(eps=my_eps, min_samples=my_min_samples, metric='precomputed', n_jobs=number_of_threads) db.fit(distance_matrix) labels = db.labels_ n_clusters = len(set(labels)) - (1 if -1 in labels else 0) n_noises = list(labels).count(-1) print('Number of clusters' + str(n_clusters)) print('Number of noises' + str(n_noises)) return list(labels)
def test_dbscan_callable(): # Tests the DBSCAN algorithm with a callable metric. # Parameters chosen specifically for this task. # Different eps to other test, because distance is not normalised. eps = 0.8 min_samples = 10 # metric is the function reference, not the string key. metric = distance.euclidean # Compute DBSCAN # parameters chosen for task core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples, algorithm='ball_tree') # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm='ball_tree') labels = db.fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters)
def test_dbscan_balltree(): # Tests the DBSCAN algorithm with balltree for neighbor calculation. eps = 0.8 min_samples = 10 D = pairwise_distances(X) core_samples, labels = dbscan(D, metric="precomputed", eps=eps, min_samples=min_samples) # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree') labels = db.fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters) db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree') labels = db.fit(X).labels_ n_clusters_3 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_3, n_clusters) db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree') labels = db.fit(X).labels_ n_clusters_4 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_4, n_clusters) db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples, algorithm='ball_tree') labels = db.fit(X).labels_ n_clusters_5 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_5, n_clusters)
def test_dbscan_feature(): # Tests the DBSCAN algorithm with a feature vector array. # Parameters chosen specifically for this task. # Different eps to other test, because distance is not normalised. eps = 0.8 min_samples = 10 metric = "euclidean" # Compute DBSCAN # parameters chosen for task core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples) # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples) labels = db.fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters)
def test_dbscan_similarity(): # Tests the DBSCAN algorithm with a similarity array. # Parameters chosen specifically for this task. eps = 0.15 min_samples = 10 # Compute similarities D = distance.squareform(distance.pdist(X)) D /= np.max(D) # Compute DBSCAN core_samples, labels = dbscan(D, metric="precomputed", eps=eps, min_samples=min_samples) # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0) assert_equal(n_clusters_1, n_clusters) db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples) labels = db.fit(D).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters)
def test_dbscan_feature(): # Tests the DBSCAN algorithm with a feature vector array. # Parameters chosen specifically for this task. # Different eps to other test, because distance is not normalised. eps = 0.8 min_samples = 10 metric = 'euclidean' # Compute DBSCAN # parameters chosen for task core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples) # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples) labels = db.fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters)
def test_dbscan_callable(): # Tests the DBSCAN algorithm with a callable metric. # Parameters chosen specifically for this task. # Different eps to other test, because distance is not normalised. eps = 0.8 min_samples = 10 # metric is the function reference, not the string key. metric = distance.euclidean # Compute DBSCAN # parameters chosen for task core_samples, labels = dbscan(X, metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree") # number of clusters, ignoring noise if present n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples, algorithm="ball_tree") labels = db.fit(X).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters)