def test_cluster_between_regions_2(self):
    """ Tests that DBSCAN can find clusters between regions. """
    x = np.array([[0, 0], [0.6, 0], [0.9, 0], [1.1, 0.2], [0.9, 0.6],
                  [1.1, 0.8], [1.4, 0.8], [2, 2]])
    ds_x = ds.array(x, block_size=(5, 2))

    dbscan = DBSCAN(n_regions=2, eps=0.5, min_samples=3)
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 1)
def test_small_cluster_2(self):
    """ Tests that DBSCAN can find clusters with less than min_samples. """
    x = np.array([[0, 0], [0, 1], [1, 0], [3, 0], [5.1, 0], [6, 0],
                  [6, 1], [10, 10]])
    ds_x = ds.array(x, block_size=(5, 2))

    # n_regions=10
    dbscan2 = DBSCAN(n_regions=10, eps=2.5, min_samples=4)
    dbscan2.fit(ds_x)

    self.assertEqual(dbscan2.n_clusters, 2)
def test_cluster_between_regions_1(self):
    """ Tests that DBSCAN can find clusters between regions. """
    x = np.array([[0, 0], [3.9, 0], [4.1, 0], [4.1, 0.89], [4.1, 0.88],
                  [5.9, 0], [5.9, 0.89], [5.9, 0.88], [6.1, 0],
                  [10, 10], [4.6, 0], [5.4, 0]])
    ds_x = ds.array(x, block_size=(5, 2))

    dbscan = DBSCAN(n_regions=10, eps=0.9, min_samples=4)
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 1)
def test_zero_samples(self):
    """ Tests DBSCAN fit when some regions contain zero samples. """
    n_samples = 2
    x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=8)
    dbscan = DBSCAN(n_regions=3, eps=.2, max_samples=100)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(2, 2))

    dbscan.fit(ds_x)
    self.assertEqual(dbscan.n_clusters, 0)
def test_random_clusters_2(self):
    """ Tests DBSCAN on random data with multiple clusters. """
    # 2 dimensions
    np.random.seed(2)
    x = np.random.uniform(0, 10, size=(1000, 2))
    ds_x = ds.array(x, block_size=(300, 2))

    dbscan = DBSCAN(n_regions=10, max_samples=10, eps=0.5, min_samples=10)
    y = dbscan.fit_predict(ds_x).collect()

    self.assertEqual(dbscan.n_clusters, 27)
    self.assertEqual(np.count_nonzero(y == -1), 206)
def test_n_clusters_moons_grid(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    setting n_regions > 1 with moon data. """
    n_samples = 1500
    x, y = make_moons(n_samples=n_samples, noise=.05)
    dbscan = DBSCAN(n_regions=4, eps=.3, max_samples=600)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    dbscan.fit(ds_x)
    self.assertEqual(dbscan.n_clusters, 2)
def test_n_clusters_blobs_grid(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    setting n_regions > 1 with blob data. """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=8)
    dbscan = DBSCAN(n_regions=4, eps=.3, max_samples=300)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    dbscan.fit(ds_x)
    self.assertEqual(dbscan.n_clusters, 3)
def test_n_clusters_circles_max_samples(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    defining max_samples with circle data. """
    n_samples = 1500
    x, y = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    dbscan = DBSCAN(n_regions=1, eps=.15, max_samples=500)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    dbscan.fit(ds_x)
    self.assertEqual(dbscan.n_clusters, 2)
def test_random_clusters_3(self):
    """ Tests DBSCAN on random data with multiple clusters. """
    # 3 dimensions
    np.random.seed(3)
    x = np.random.uniform(0, 10, size=(1000, 3))
    ds_x = ds.array(x, block_size=(300, 3))

    dbscan = DBSCAN(n_regions=10, dimensions=[0, 1], eps=0.9, min_samples=4)
    y = dbscan.fit_predict(ds_x).collect()

    self.assertEqual(dbscan.n_clusters, 50)
    self.assertEqual(np.count_nonzero(y == -1), 266)
def main():
    data = ds.load_txt_file("/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/gaia"
                            "/dbscan/data_scaled.csv",
                            block_size=(10000, 5))
    dbscan = DBSCAN(eps=0.19, min_samples=5, max_samples=5000, n_regions=17,
                    dimensions=[0, 1])

    performance.measure("DBSCAN", "gaia", dbscan.fit, data)
def test_sparse(self):
    """ Tests that DBSCAN produces the same results with sparse and
    dense data. """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    dbscan = DBSCAN(n_regions=1, eps=.15)

    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    x = StandardScaler().fit_transform(x)

    dense = ds.array(x, block_size=(300, 2))
    sparse = ds.array(csr_matrix(x), block_size=(300, 2))

    y_dense = dbscan.fit_predict(dense).collect()
    y_sparse = dbscan.fit_predict(sparse).collect()

    self.assertTrue(np.array_equal(y_dense, y_sparse))
def main():
    file = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/data_scaled.csv"
    data = ds.load_txt_file(file, block_size=(10000, 5))
    dbscan = DBSCAN(eps=0.19, min_samples=5, max_samples=5000, n_regions=17,
                    dimensions=[0, 1])

    performance.measure("DBSCAN", "gaia", dbscan.fit, data)
def test_n_clusters_aniso_dimensions(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    dimensions is not None. """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    dbscan = DBSCAN(n_regions=5, dimensions=[1], eps=.15)

    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    y_pred = dbscan.fit_predict(ds_x).collect()

    true_sizes = {19, 496, 491, 488, 6}
    cluster_sizes = {y_pred[y_pred == -1].size,
                     y_pred[y_pred == 0].size,
                     y_pred[y_pred == 1].size,
                     y_pred[y_pred == 2].size,
                     y_pred[y_pred == 3].size}

    self.assertEqual(dbscan.n_clusters, 4)
    self.assertEqual(true_sizes, cluster_sizes)
def main():
    np.random.seed(0)

    # ============
    # Generate datasets. We choose a size big enough to show the scalability
    # of the algorithms, but not so big that running times become too long
    # ============
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05,
                                 random_state=170)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropically distributed data
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)

    plot_num = 1

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    datasets = [
        (noisy_circles, {'damping': .77, 'preference': -240,
                         'quantile': .2, 'n_clusters': 2}),
        (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
        (varied, {'eps': .18, 'n_neighbors': 2}),
        (aniso, {'eps': .15, 'n_neighbors': 2}),
        (blobs, {}),
        (no_structure, {})]

    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # ============
        # Create cluster objects
        # ============
        kmeans = KMeans(n_clusters=params["n_clusters"])
        dbscan = DBSCAN(eps=params["eps"], n_regions=1)
        gm = GaussianMixture(n_components=params["n_clusters"])

        clustering_algorithms = (
            ('K-Means', kmeans),
            ('DBSCAN', dbscan),
            ('Gaussian mixture', gm))

        for name, algorithm in clustering_algorithms:
            t0 = time.time()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                            "connectivity matrix is [0-9]{1,2} > 1. "
                            "Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral "
                            "embedding may not work as expected.",
                    category=UserWarning)
                data = ds.array(X, block_size=(300, 2))
                algorithm.fit(data)

            t1 = time.time()
            y_pred = algorithm.fit_predict(data).collect()

            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(list(islice(cycle(
                ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
                 '#984ea3', '#999999', '#e41a1c', '#dede00']),
                int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()
def initialize(alg_names, args):
    return [{'KMeans': lambda x: KMeans(**get_kmeans_kwargs(x)),
             'DBSCAN': lambda x: DBSCAN(**get_dbscan_kwargs(x)),
             'GaussianMixture': lambda x: GaussianMixture(**get_gm_kwargs(x))
             }[name](args) for name in alg_names]
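# NOTE: the kwargs builders used by initialize() above (get_kmeans_kwargs,
# get_dbscan_kwargs, get_gm_kwargs) are not shown in this section. The
# function below is a hypothetical sketch of get_dbscan_kwargs, assuming the
# command-line flags used by the DBSCAN driver further down (epsilon,
# regions, min_samples, max_samples, dimensions); it only illustrates how
# parsed arguments could be mapped to DBSCAN constructor keywords.
def get_dbscan_kwargs(args):
    kwargs = {"eps": args.epsilon,
              "min_samples": args.min_samples,
              "max_samples": args.max_samples,
              "n_regions": args.regions}

    # grid dimensions are given as a comma separated string, e.g. "0,1"
    if args.dimensions:
        kwargs["dimensions"] = np.array(args.dimensions.split(","),
                                        dtype=int)

    return kwargs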
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read file in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e", "--epsilon", metavar="EPSILON", type=float,
                        help="default is 0.5", default=0.5)
    parser.add_argument("-r", "--regions", metavar="N_REGIONS", type=int,
                        help="number of regions to create", default=1)
    parser.add_argument("-d", "--dimensions", metavar="DIMENSIONS", type=str,
                        help="comma separated dimensions to use in the grid",
                        required=False)
    parser.add_argument("-x", "--max_samples", metavar="MAX_SAMPLES",
                        type=int,
                        help="maximum samples to process per task "
                             "(default is 1000)", default=1000)
    parser.add_argument("-m", "--min_samples", metavar="MIN_SAMPLES",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the "
                             "input data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        compss_barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # use every dimension for the grid unless specific dimensions are given
    # (args.features is only set for SVMLight files, so fall back to the
    # number of columns of the loaded data)
    dims = range(args.features if args.features is not None else n_features)

    if args.dimensions:
        dims = args.dimensions.split(",")
        dims = np.array(dims, dtype=int)

    dbscan = DBSCAN(eps=args.epsilon, min_samples=args.min_samples,
                    max_samples=args.max_samples, n_regions=args.regions,
                    dimensions=dims)
    dbscan.fit(x)

    compss_barrier()
    fit_time = time.time() - s_time

    out = [dbscan.eps, dbscan.min_samples, dbscan.max_samples,
           dbscan.n_regions, len(dims), args.block_size, dbscan.n_clusters,
           read_time, fit_time]

    print(out)