def test_fit(self):
    """ Tests that the fit method returns the expected centers using toy
    data. """
    arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]])
    x = ds.array(arr, block_size=(2, 2))

    km = KMeans(n_clusters=2, random_state=666, verbose=False)
    km.fit(x)

    expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]])

    self.assertTrue((km.centers == expected_centers).all())
def test_predict(self):
    """ Tests that labels are correctly predicted using toy data. """
    p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

    arr1 = np.array([p1, p2, p3, p4])
    x = ds.array(arr1, block_size=(2, 2))

    km = KMeans(n_clusters=2, random_state=666)
    km.fit(x)

    p5, p6 = [10, 10], [-10, -10]

    arr2 = np.array([p1, p2, p3, p4, p5, p6])
    x_test = ds.array(arr2, block_size=(2, 2))

    labels = km.predict(x_test).collect()
    expected_labels = np.array([0, 0, 1, 1, 0, 1])

    self.assertTrue(np.array_equal(labels, expected_labels))
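# A standalone sketch of the fit/predict flow exercised by the two tests
# above, assuming dislib is importable as shown (a usage illustration, not
# part of the test suite):
#
#     import numpy as np
#     import dislib as ds
#     from dislib.cluster import KMeans
#
#     x = ds.array(np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]]),
#                  block_size=(2, 2))
#     km = KMeans(n_clusters=2, random_state=666)
#     km.fit(x)
#     print(km.predict(x).collect())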
def test_fit_predict(self):
    """ Tests fit_predict. """
    x, y = make_blobs(n_samples=1500, random_state=170)
    x_filtered = np.vstack(
        (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))

    x_train = ds.array(x_filtered, block_size=(300, 2))

    kmeans = KMeans(n_clusters=3, random_state=170)
    labels = kmeans.fit_predict(x_train).collect()

    skmeans = SKMeans(n_clusters=3, random_state=170)
    sklabels = skmeans.fit_predict(x_filtered)

    centers = np.array([[-8.941375656533449, -5.481371322614891],
                        [-4.524023204953875, 0.06235042593214654],
                        [2.332994701667008, 0.37681003933082696]])

    self.assertTrue(np.allclose(centers, kmeans.centers))
    self.assertTrue(np.allclose(labels, sklabels))
def main():
    n_samples = 300000000
    n_chunks = 1536
    chunk_size = int(np.ceil(n_samples / n_chunks))
    n_features = 100
    n_clusters = 500

    x = ds.random_array((n_samples, n_features), (chunk_size, n_features))

    km = KMeans(n_clusters=n_clusters, max_iter=5, tol=0, arity=48)
    performance.measure("KMeans", "300M", km.fit, x)
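# For reference, a scaled-down variant of the benchmark above that fits on a
# single machine (the 300M-sample run assumes a distributed COMPSs deployment
# and the performance helper used above; sizes here are illustrative):
#
#     x = ds.random_array((30000, 100), (2000, 100))
#     km = KMeans(n_clusters=50, max_iter=5, tol=0, arity=48)
#     km.fit(x)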
def test_init_params(self):
    """ Tests that the KMeans object correctly sets the initialization
    parameters. """
    n_clusters = 2
    max_iter = 1
    tol = 1e-4
    seed = 666
    arity = 2
    init = "random"

    km = KMeans(n_clusters=n_clusters, max_iter=max_iter, tol=tol,
                arity=arity, random_state=seed)

    expected = (n_clusters, init, max_iter, tol, arity)
    real = (km._n_clusters, km._init, km._max_iter, km._tol, km._arity)
    self.assertEqual(expected, real)
def test_init(self):
    # With dense data
    x, y = make_blobs(n_samples=1500, random_state=170)
    x_filtered = np.vstack(
        (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))
    x_train = ds.array(x_filtered, block_size=(300, 2))

    init = np.random.random((5, 2))
    km = KMeans(n_clusters=5, init=init)
    km.fit(x_train)

    self.assertTrue(np.array_equal(km._init, init))
    self.assertFalse(np.array_equal(km.centers, init))

    # With sparse data
    x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2))
    init = csr_matrix(np.random.random((5, 2)))

    km = KMeans(n_clusters=5, init=init)
    km.fit(x_sp)

    self.assertTrue(np.array_equal(km._init.toarray(), init.toarray()))
    self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray()))
def test_sparse(self):
    """ Tests that K-means produces the same results using dense and sparse
    data structures. """
    file_ = "tests/files/libsvm/2"

    x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True)
    x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False)

    kmeans = KMeans(random_state=170)

    y_sparse = kmeans.fit_predict(x_sp).collect()
    sparse_c = kmeans.centers.toarray()

    kmeans = KMeans(random_state=170)

    y_dense = kmeans.fit_predict(x_ds).collect()
    dense_c = kmeans.centers

    self.assertTrue(np.allclose(sparse_c, dense_c))
    self.assertTrue(np.array_equal(y_sparse, y_dense))
def test_kmeans(self):
    """ Tests K-means fit_predict on Hecuba-backed data and compares the
    result with regular ds-arrays. """
    config.session.execute("TRUNCATE TABLE hecuba.istorage")
    config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

    x, y = make_blobs(n_samples=1500, random_state=170)
    x_filtered = np.vstack(
        (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))

    block_size = (x_filtered.shape[0] // 10, x_filtered.shape[1])

    # One regular ds-array and one made persistent in Hecuba storage
    x_train = ds.array(x_filtered, block_size=block_size)
    x_train_hecuba = ds.array(x=x_filtered, block_size=block_size)
    x_train_hecuba.make_persistent(name="hecuba_dislib.test_array")

    kmeans = KMeans(n_clusters=3, random_state=170)
    labels = kmeans.fit_predict(x_train).collect()

    kmeans2 = KMeans(n_clusters=3, random_state=170)
    h_labels = kmeans2.fit_predict(x_train_hecuba).collect()

    self.assertTrue(np.allclose(kmeans.centers, kmeans2.centers))
    self.assertTrue(np.allclose(labels, h_labels))
def main(): """ Usage example copied from scikit-learn's webpage. """ plt.figure(figsize=(12, 12)) n_samples = 1500 random_state = 170 x, y = make_blobs(n_samples=n_samples, random_state=random_state) dis_x = ds.array(x, block_size=(300, 2)) # Incorrect number of clusters kmeans = KMeans(n_clusters=2, random_state=random_state) y_pred = kmeans.fit_predict(dis_x).collect() plt.subplot(221) plt.scatter(x[:, 0], x[:, 1], c=y_pred) centers = kmeans.centers plt.scatter(centers[:, 0], centers[:, 1], c="red") plt.title("Incorrect Number of Blobs") # Anisotropicly distributed data transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]] x_aniso = np.dot(x, transformation) dis_x_aniso = ds.array(x_aniso, block_size=(300, 2)) kmeans = KMeans(n_clusters=3, random_state=random_state) y_pred = kmeans.fit_predict(dis_x_aniso).collect() plt.subplot(222) plt.scatter(x_aniso[:, 0], x_aniso[:, 1], c=y_pred) centers = kmeans.centers plt.scatter(centers[:, 0], centers[:, 1], c="red") plt.title("Anisotropicly Distributed Blobs") # Different variance x_varied, y_varied = make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state) dis_x_varied = ds.array(x_varied, block_size=(300, 2)) kmeans = KMeans(n_clusters=3, random_state=random_state) y_pred = kmeans.fit_predict(dis_x_varied).collect() plt.subplot(223) plt.scatter(x_varied[:, 0], x_varied[:, 1], c=y_pred) centers = kmeans.centers plt.scatter(centers[:, 0], centers[:, 1], c="red") plt.title("Unequal Variance") # Unevenly sized blobs x_filtered = np.vstack((x[y == 0][:500], x[y == 1][:100], x[y == 2][:10])) dis_x_filtered = ds.array(x_filtered, block_size=(300, 2)) kmeans = KMeans(n_clusters=3, random_state=random_state) y_pred = kmeans.fit_predict(dis_x_filtered).collect() plt.subplot(224) plt.scatter(x_filtered[:, 0], x_filtered[:, 1], c=y_pred) centers = kmeans.centers plt.scatter(centers[:, 0], centers[:, 1], c="red") plt.title("Unevenly Sized Blobs") plt.show()
def main():
    np.random.seed(0)

    # ============
    # Generate datasets. We choose a size big enough to see the scalability
    # of the algorithms, but not so big that running times become too long.
    # ============
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05,
                                 random_state=170)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropically distributed data
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # Blobs with varied variances
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
                        wspace=.05, hspace=.01)

    plot_num = 1

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    datasets = [
        (noisy_circles, {'damping': .77, 'preference': -240,
                         'quantile': .2, 'n_clusters': 2}),
        (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
        (varied, {'eps': .18, 'n_neighbors': 2}),
        (aniso, {'eps': .15, 'n_neighbors': 2}),
        (blobs, {}),
        (no_structure, {})]

    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        # Update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset

        # Normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # ============
        # Create cluster objects
        # ============
        kmeans = KMeans(n_clusters=params["n_clusters"])
        dbscan = DBSCAN(eps=params["eps"], n_regions=1)
        gm = GaussianMixture(n_components=params["n_clusters"])

        clustering_algorithms = (
            ('K-Means', kmeans),
            ('DBSCAN', dbscan),
            ('Gaussian mixture', gm))

        for name, algorithm in clustering_algorithms:
            t0 = time.time()

            # Catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the "
                            "connectivity matrix is [0-9]{1,2} > 1. "
                            "Completing it to avoid stopping the tree "
                            "early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral "
                            "embedding may not work as expected.",
                    category=UserWarning)
                data = ds.array(X, block_size=(300, 2))
                algorithm.fit(data)

            t1 = time.time()
            y_pred = algorithm.fit_predict(data).collect()

            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)

            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(list(islice(cycle(
                ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
                 '#984ea3', '#999999', '#e41a1c', '#dede00']),
                int(max(y_pred) + 1))))

            # Add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()
def initialize(alg_names, args):
    # Build one configured estimator per requested algorithm name.
    return [{'KMeans': lambda x: KMeans(**get_kmeans_kwargs(x)),
             'DBSCAN': lambda x: DBSCAN(**get_dbscan_kwargs(x)),
             'GaussianMixture': lambda x: GaussianMixture(**get_gm_kwargs(x))
             }[name](args) for name in alg_names]
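# Usage sketch for initialize() above (args and the get_*_kwargs helpers are
# defined elsewhere in this script): each requested name is looked up in a
# plain dict and mapped to a configured estimator, so an unknown name raises
# KeyError.
#
#     kmeans, dbscan = initialize(['KMeans', 'DBSCAN'], args)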
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 50", default=50)
    parser.add_argument("-c", "--centers", metavar="N_CENTERS", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the "
                             "input data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input "
                                          "file represents labels (only "
                                          "for text files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)

    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    kmeans = KMeans(n_clusters=args.centers, max_iter=args.iteration,
                    arity=args.arity, verbose=True)
    kmeans.fit(x)

    barrier()
    fit_time = time.time() - s_time

    out = [args.centers, args.arity, args.block_size, read_time, fit_time]

    print(out)
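# Illustrative invocations of the benchmark above (the script name and file
# paths are hypothetical; the flags match the parser defined in main()):
#
#     python kmeans.py -c 10 -b 100,100 -i 5 train.csv
#     python kmeans.py --svmlight -f 780 -c 10 train.svm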
def _initialize_parameters(self, x, random_state):
    """Initialization of the Gaussian mixture parameters.

    Parameters
    ----------
    x : ds-array, shape=(n_samples, n_features)
        Data points.
    random_state : RandomState
        A random number generator instance.
    """
    if self.weights_init is not None:
        self.weights_ = self.weights_init / np.sum(self.weights_init)
    if self.means_init is not None:
        self.means_ = self.means_init
    if self.precisions_init is not None:
        if self.covariance_type == 'full':
            self.precisions_cholesky_ = np.array(
                [linalg.cholesky(prec_init, lower=True)
                 for prec_init in self.precisions_init])
        elif self.covariance_type == 'tied':
            self.precisions_cholesky_ = linalg.cholesky(
                self.precisions_init, lower=True)
        else:
            self.precisions_cholesky_ = self.precisions_init

    initialize_params = (self.weights_init is None
                         or self.means_init is None
                         or self.precisions_init is None)
    if initialize_params:
        n_components = self.n_components
        resp_blocks = []
        if self.init_params == 'kmeans':
            if self.verbose:
                print("KMeans initialization start")
            seed = random_state.randint(0, int(1e8))
            kmeans = KMeans(n_clusters=n_components, random_state=seed,
                            verbose=self.verbose)
            y = kmeans.fit_predict(x)
            self.kmeans = kmeans
            for y_part in y._iterator(axis=0):
                resp_blocks.append([_resp_subset(y_part._blocks,
                                                 n_components)])
        elif self.init_params == 'random':
            chunks = x._n_blocks[0]
            seeds = random_state.randint(np.iinfo(np.int32).max,
                                         size=chunks)
            for i, x_row in enumerate(x._iterator(axis=0)):
                resp_blocks.append([_random_resp_subset(x_row.shape[0],
                                                        n_components,
                                                        seeds[i])])
        else:
            raise ValueError("Unimplemented initialization method '%s'"
                             % self.init_params)

        resp = Array(blocks=resp_blocks,
                     top_left_shape=(x._top_left_shape[0], n_components),
                     reg_shape=(x._reg_shape[0], n_components),
                     shape=(x.shape[0], n_components),
                     sparse=False)

        weights, nk, means = self._estimate_parameters(x, resp)
        if self.means_init is None:
            self.means_ = means
        if self.weights_init is None:
            self.weights_ = weights
        if self.precisions_init is None:
            cov, p_c = _estimate_covariances(x, resp, nk,
                                             self.means_, self.reg_covar,
                                             self.covariance_type,
                                             self.arity)
            self.covariances_ = cov
            self.precisions_cholesky_ = p_c

        for resp_block in resp._blocks:
            compss_delete_object(resp_block)
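# A minimal sketch of what the 'kmeans' init path above computes per block,
# assuming _resp_subset builds one-hot responsibilities from cluster labels
# (the actual helper is defined elsewhere as a COMPSs task; this standalone
# version is only illustrative).
import numpy as np

def one_hot_resp(labels, n_components):
    # Each sample gets responsibility 1 for its assigned cluster, 0 elsewhere.
    resp = np.zeros((labels.shape[0], n_components))
    resp[np.arange(labels.shape[0]), labels.astype(int)] = 1
    return resp

# one_hot_resp(np.array([0, 2, 1]), 3) ->
# [[1, 0, 0],
#  [0, 0, 1],
#  [0, 1, 0]]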