Example #1
0
    def test_cluster_between_regions_2(self):
        """ Checks that DBSCAN merges points spanning two regions into a
        single cluster. """
        samples = [[0, 0], [0.6, 0], [0.9, 0], [1.1, 0.2],
                   [0.9, 0.6], [1.1, 0.8], [1.4, 0.8], [2, 2]]
        data = ds.array(np.array(samples), block_size=(5, 2))

        estimator = DBSCAN(n_regions=2, eps=0.5, min_samples=3)
        estimator.fit(data)
        self.assertEqual(estimator.n_clusters, 1)
Example #2
0
    def test_small_cluster_2(self):
        """ Checks DBSCAN behaviour when groups smaller than min_samples
        are present in the data. """
        points = [[0, 0], [0, 1], [1, 0], [3, 0],
                  [5.1, 0], [6, 0], [6, 1], [10, 10]]
        data = ds.array(np.array(points), block_size=(5, 2))

        # n_regions=10
        estimator = DBSCAN(n_regions=10, eps=2.5, min_samples=4)
        estimator.fit(data)
        self.assertEqual(estimator.n_clusters, 2)
Example #3
0
    def test_cluster_between_regions_1(self):
        """ Checks that a cluster whose points straddle region borders is
        detected as one cluster. """
        points = [[0, 0], [3.9, 0], [4.1, 0], [4.1, 0.89], [4.1, 0.88],
                  [5.9, 0], [5.9, 0.89], [5.9, 0.88], [6.1, 0], [10, 10],
                  [4.6, 0], [5.4, 0]]
        data = ds.array(np.array(points), block_size=(5, 2))

        estimator = DBSCAN(n_regions=10, eps=0.9, min_samples=4)
        estimator.fit(data)
        self.assertEqual(estimator.n_clusters, 1)
Example #4
0
 def test_zero_samples(self):
     """ Checks that DBSCAN copes with regions that receive no samples
     at all. """
     samples, _ = make_blobs(n_samples=2, n_features=2, random_state=8)
     estimator = DBSCAN(n_regions=3, eps=.2, max_samples=100)
     scaled = StandardScaler().fit_transform(samples)
     data = ds.array(scaled, block_size=(2, 2))
     estimator.fit(data)
     self.assertEqual(estimator.n_clusters, 0)
Example #5
0
 def test_n_clusters_moons_grid(self):
     """ Checks the cluster count on moon-shaped data when the space is
     split into several regions (n_regions > 1). """
     samples, _ = make_moons(n_samples=1500, noise=.05)
     estimator = DBSCAN(n_regions=4, eps=.3, max_samples=600)
     scaled = StandardScaler().fit_transform(samples)
     data = ds.array(scaled, block_size=(300, 2))
     estimator.fit(data)
     self.assertEqual(estimator.n_clusters, 2)
Example #6
0
 def test_n_clusters_blobs_grid(self):
     """ Checks the cluster count on blob data when the space is split
     into several regions (n_regions > 1). """
     samples, _ = make_blobs(n_samples=1500, n_features=2, random_state=8)
     estimator = DBSCAN(n_regions=4, eps=.3, max_samples=300)
     scaled = StandardScaler().fit_transform(samples)
     data = ds.array(scaled, block_size=(300, 2))
     estimator.fit(data)
     self.assertEqual(estimator.n_clusters, 3)
Example #7
0
 def test_n_clusters_circles_max_samples(self):
     """ Checks the cluster count on concentric-circle data when a
     max_samples cap is set. """
     samples, _ = make_circles(n_samples=1500, factor=.5, noise=.05)
     estimator = DBSCAN(n_regions=1, eps=.15, max_samples=500)
     scaled = StandardScaler().fit_transform(samples)
     data = ds.array(scaled, block_size=(300, 2))
     estimator.fit(data)
     self.assertEqual(estimator.n_clusters, 2)
Example #8
0
def main():
    """ Command-line driver for distributed DBSCAN.

    Loads a dataset from a CSV/text or SVMLight file, fits DBSCAN with the
    parameters given on the command line, and prints a summary list with
    the configuration, the number of clusters found, and the read/fit
    times.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight",
                        help="read file in SVMlLight format",
                        action="store_true")
    parser.add_argument("-dt",
                        "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e",
                        "--epsilon",
                        metavar="EPSILON",
                        type=float,
                        help="default is 0.5",
                        default=0.5)
    parser.add_argument("-r",
                        "--regions",
                        metavar="N_REGIONS",
                        type=int,
                        help="number of regions to create",
                        default=1)
    parser.add_argument("-d",
                        "--dimensions",
                        metavar="DIMENSIONS",
                        type=str,
                        help="comma separated dimensions to use in the grid",
                        required=False)
    parser.add_argument("-x",
                        "--max_samples",
                        metavar="MAX_SAMPLES",
                        type=int,
                        help="maximum samples to process per task ("
                        "default is 1000)",
                        default=1000)
    parser.add_argument("-m",
                        "--min_samples",
                        metavar="MIN_SAMPLES",
                        type=int,
                        help="default is 5",
                        default=5)
    parser.add_argument("-b",
                        "--block_size",
                        metavar="BLOCK_SIZE",
                        type=str,
                        help="two comma separated ints that represent the "
                        "size of the blocks in which to divide the input "
                        "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-f",
                        "--features",
                        metavar="N_FEATURES",
                        help="number of features of the input data "
                        "(only for SVMLight files)",
                        type=int,
                        default=None,
                        required=False)
    parser.add_argument("--dense",
                        help="store data in dense format (only "
                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled",
                        help="the last column of the input file "
                        "represents labels (only for text "
                        "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        # Drop the trailing label column; only the features are clustered.
        x = x[:, :n_features - 1]

    if args.detailed_times:
        compss_barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    # Default to every column of the (possibly label-trimmed) data.
    # NOTE: args.features is None unless --svmlight is used, so it cannot
    # be used here (range(None) raises TypeError for text files).
    dims = range(x.shape[1])

    if args.dimensions:
        dims = np.array(args.dimensions.split(","), dtype=int)

    dbscan = DBSCAN(eps=args.epsilon,
                    min_samples=args.min_samples,
                    max_samples=args.max_samples,
                    n_regions=args.regions,
                    dimensions=dims)
    dbscan.fit(x)

    compss_barrier()
    fit_time = time.time() - s_time

    # args.part_size did not exist (no --part_size argument is defined and
    # accessing it raised AttributeError); report the block size instead.
    out = [
        dbscan.eps, dbscan.min_samples, dbscan.max_samples, dbscan.n_regions,
        len(dims), args.block_size, dbscan.n_clusters, read_time, fit_time
    ]

    print(out)