def test_cluster_between_regions_2(self):
    """ Tests that DBSCAN can find a cluster that spans region
    boundaries. """
    x = np.array([[0, 0], [0.6, 0], [0.9, 0], [1.1, 0.2], [0.9, 0.6],
                  [1.1, 0.8], [1.4, 0.8], [2, 2]])
    ds_x = ds.array(x, block_size=(5, 2))

    dbscan = DBSCAN(n_regions=2, eps=0.5, min_samples=3)
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 1)
def test_small_cluster_2(self):
    """ Tests that DBSCAN can find clusters with fewer than min_samples
    samples. """
    x = np.array([[0, 0], [0, 1], [1, 0], [3, 0], [5.1, 0], [6, 0],
                  [6, 1], [10, 10]])
    ds_x = ds.array(x, block_size=(5, 2))

    dbscan2 = DBSCAN(n_regions=10, eps=2.5, min_samples=4)
    dbscan2.fit(ds_x)

    self.assertEqual(dbscan2.n_clusters, 2)
def test_cluster_between_regions_1(self):
    """ Tests that DBSCAN can find a cluster that spans region
    boundaries. """
    x = np.array([[0, 0], [3.9, 0], [4.1, 0], [4.1, 0.89], [4.1, 0.88],
                  [5.9, 0], [5.9, 0.89], [5.9, 0.88], [6.1, 0], [10, 10],
                  [4.6, 0], [5.4, 0]])
    ds_x = ds.array(x, block_size=(5, 2))

    dbscan = DBSCAN(n_regions=10, eps=0.9, min_samples=4)
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 1)
def test_zero_samples(self):
    """ Tests DBSCAN fit when some regions contain zero samples. """
    n_samples = 2
    x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=8)
    dbscan = DBSCAN(n_regions=3, eps=.2, max_samples=100)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(2, 2))
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 0)
def test_n_clusters_moons_grid(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    setting n_regions > 1 with moon data. """
    n_samples = 1500
    x, y = make_moons(n_samples=n_samples, noise=.05)
    dbscan = DBSCAN(n_regions=4, eps=.3, max_samples=600)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 2)
def test_n_clusters_blobs_grid(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    setting n_regions > 1 with blob data. """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=8)
    dbscan = DBSCAN(n_regions=4, eps=.3, max_samples=300)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 3)
def test_n_clusters_circles_max_samples(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    defining max_samples with circle data. """
    n_samples = 1500
    x, y = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    dbscan = DBSCAN(n_regions=1, eps=.15, max_samples=500)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))
    dbscan.fit(ds_x)

    self.assertEqual(dbscan.n_clusters, 2)
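# A hedged sketch of one more test, not part of the original suite: it
# exercises the `dimensions` parameter (used by main() below) by building
# the region grid over the first feature only. The expected cluster count
# of 3 mirrors test_n_clusters_blobs_grid and is an assumption here, as is
# passing a one-element array for `dimensions`.
def test_n_clusters_blobs_dimensions(self):
    """ Sketch: tests DBSCAN with the region grid restricted to a subset
    of the dimensions. """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=8)
    # grid over dimension 0 only (assumed valid usage of `dimensions`)
    dbscan = DBSCAN(n_regions=4, dimensions=np.array([0]), eps=.3,
                    max_samples=300)

    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))
    dbscan.fit(ds_x)

    # assumed: the blobs from random_state=8 still separate into 3 clusters
    self.assertEqual(dbscan.n_clusters, 3)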
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read file in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e", "--epsilon", metavar="EPSILON", type=float,
                        help="default is 0.5", default=0.5)
    parser.add_argument("-r", "--regions", metavar="N_REGIONS", type=int,
                        help="number of regions to create", default=1)
    parser.add_argument("-d", "--dimensions", metavar="DIMENSIONS", type=str,
                        help="comma separated dimensions to use in the grid",
                        required=False)
    parser.add_argument("-x", "--max_samples", metavar="MAX_SAMPLES",
                        type=int,
                        help="maximum samples to process per task "
                             "(default is 1000)", default=1000)
    parser.add_argument("-m", "--min_samples", metavar="MIN_SAMPLES",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the "
                             "input data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-f", "--features", metavar="N_FEATURES", type=int,
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        default=None, required=False)
    parser.add_argument("--dense", action="store_true",
                        help="store data in dense format (only for SVMLight "
                             "files)")
    parser.add_argument("--labeled", action="store_true",
                        help="the last column of the input file represents "
                             "labels (only for text files)")
    parser.add_argument("train_data", type=str,
                        help="input file in CSV or SVMLight format")

    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    # parse "rows,cols" into a block size tuple
    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    # drop the label column of labeled text files
    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        compss_barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # build the grid over every dimension of the loaded data unless -d
    # restricts it (--features is None for non-SVMLight input, so the
    # dimension count is taken from the data itself)
    dims = range(x.shape[1])

    if args.dimensions:
        dims = args.dimensions.split(",")
        dims = np.array(dims, dtype=int)

    dbscan = DBSCAN(eps=args.epsilon, min_samples=args.min_samples,
                    max_samples=args.max_samples, n_regions=args.regions,
                    dimensions=dims)
    dbscan.fit(x)
    compss_barrier()
    fit_time = time.time() - s_time

    out = [dbscan.eps, dbscan.min_samples, dbscan.max_samples,
           dbscan.n_regions, len(dims), block_size, dbscan.n_clusters,
           read_time, fit_time]

    print(out)
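# Entry-point guard so the benchmark can be run directly; a sketch of an
# invocation follows (the script and data file names are hypothetical, the
# flags are the ones defined by the parser above):
#
#     python dbscan_benchmark.py -e 0.3 -r 4 -x 600 -m 5 -b 300,2 train.csv
#
if __name__ == "__main__":
    main()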