def __kmeans(self, km_points_orig, nClusters):
    """Cluster the per-image values in ``km_points_orig`` into ``nClusters``
    groups ordered by centroid value.

    Parameters
    ----------
    km_points_orig : dict
        Maps an image key to a scalar value (flower pixel percentage).
    nClusters : int
        Number of clusters; must be > 1.

    Returns
    -------
    dict
        Maps each image key to an int cluster index in ``[0, nClusters)``,
        where index 0 is the cluster with the smallest centroid value.
    """
    assert isinstance(km_points_orig, dict)
    assert isinstance(nClusters, int) and nClusters > 1
    km = skKMeans(n_clusters=nClusters)
    # Get the ordered set of points (i.e. flower pixel percentages of each
    # image), shaped (n_samples, 1) as scikit-learn expects.
    km_points = np.array([
        k[1] for k in sorted(km_points_orig.items(), key=operator.itemgetter(1))
    ]).reshape((-1, 1))
    # Compute KMeans
    km.fit(km_points)
    # FIX: flatten the (nClusters, 1) centroid array to plain floats before
    # sorting.  The original sorted a list of 1-element ndarrays, which
    # compares arrays element-wise and relies on implicit truth-value
    # evaluation — deprecated/ambiguous behavior in NumPy.
    km_centroids = sorted(float(c[0]) for c in km.cluster_centers_)
    # Assign each image to the cluster whose (value-ordered) centroid is
    # closest, so cluster indices are stable and ordered by centroid value.
    final_img_clusters = {}
    for k, v in km_points_orig.items():
        # Compute distance to each of the centroids
        dist = np.array([abs(v - q) for q in km_centroids])
        # Get the closest centroid
        final_img_clusters[k] = int(dist.argmin())
    return final_img_clusters
def test_speed_vs_sk(self):
    """GPU KMeans must fit at least as fast as scikit-learn's CPU KMeans."""
    from sklearn.cluster import KMeans as skKMeans
    X, _ = make_blobs(n_samples=100000, centers=10,
                      cluster_std=1., random_state=42)

    gpu_model = KMeans(n_gpus=1, n_clusters=10, random_state=42)
    # The first fit only warms up the CUDA kernels (~2 s load time),
    # so time the second fit.
    gpu_model.fit(X)
    t0 = time.time()
    gpu_model.fit(X)
    gpu_elapsed = time.time() - t0

    cpu_model = skKMeans(n_init=1, n_clusters=10, init='random',
                         algorithm='full', n_jobs=-1)
    t0 = time.time()
    cpu_model.fit(X)
    cpu_elapsed = time.time() - t0

    assert gpu_elapsed <= cpu_elapsed
def __init__(self, dataset, n_classes):
    """Derive kernel classes by clustering normalized results.

    K-means groups similar result vectors; the dominant column of each
    cluster centroid names a representative kernel, and those names become
    the classes for a decision-tree classifier.
    """
    fitted = skKMeans(n_clusters=n_classes, random_state=0).fit(dataset.normalized)
    # One representative kernel per centroid: the column with the largest
    # centroid component.
    self.classes = [
        dataset.normalized.columns[np.argmax(center)]
        for center in fitted.cluster_centers_
    ]
    self.name = "{}{}".format(self.cls_name, n_classes)
def __init__(self, dataset, n_classes):
    """Derive kernel classes by clustering in a PCA-reduced space.

    The normalized data is projected onto 25 principal components, clustered
    with k-means, and the centroids are mapped back to the original feature
    space to pick one representative kernel (column) per cluster.
    """
    data = dataset.normalized.reset_index(drop=True)
    pca = PCA(n_components=25)
    pca.fit(data)
    # Column means are needed to undo the PCA centering later.
    mu = data.mean(axis=0).to_numpy()
    reduced = pca.transform(data)
    model = skKMeans(n_clusters=n_classes, random_state=0).fit(reduced)
    # Bring the centroids back into the original feature space so their
    # components line up with the data columns.
    centroids = self._invert_pca(pca, mu, model.cluster_centers_)
    self.classes = [data.columns[np.argmax(c)] for c in centroids]
    self.name = "{}{}".format(self.cls_name, n_classes)
def test_accuracy(self):
    """GPU KMeans clustering quality must be within 10% of scikit-learn's."""
    from sklearn.cluster import KMeans as skKMeans
    X, true_labels = make_blobs(n_samples=500000, centers=10,
                                cluster_std=1., random_state=42)

    gpu_model = KMeans(n_gpus=1, n_clusters=10, random_state=42)
    gpu_model.fit(X)

    cpu_model = skKMeans(n_init=1, n_clusters=10, random_state=42)
    cpu_model.fit(X)

    gpu_score = v_measure_score(gpu_model.labels_, true_labels)
    cpu_score = v_measure_score(cpu_model.labels_, true_labels)
    # We must be better than, or at most 10% worse than, scikit-learn —
    # anything else points to a real regression.
    assert gpu_score - cpu_score >= -0.1
def test_accuracy(self):
    """GPU KMeans quality must be within 10% of scikit-learn's (random init)."""
    from sklearn.cluster import KMeans as skKMeans
    X, true_labels = make_blobs(n_samples=100000, centers=10,
                                cluster_std=1., random_state=42)

    gpu_model = KMeans(n_gpus=1, n_clusters=10, random_state=42)
    gpu_model.fit(X)

    cpu_model = skKMeans(n_init=1, n_clusters=10, init='random',
                         random_state=42)
    cpu_model.fit(X)

    gpu_score = v_measure_score(gpu_model.labels_, true_labels)
    cpu_score = v_measure_score(cpu_model.labels_, true_labels)
    # We must be better than, or at most 10% worse than, scikit-learn —
    # anything else points to a real regression.
    assert gpu_score - cpu_score >= -0.1
def test_speed_vs_sk(self):
    """Report (print, no assertion) fit times of GPU KMeans vs scikit-learn."""
    from sklearn.cluster import KMeans as skKMeans
    X, _ = make_blobs(n_samples=100000, centers=10,
                      cluster_std=1., random_state=42)

    gpu_model = KMeans(n_gpus=1, n_clusters=10)
    t0 = time.time()
    gpu_model.fit(X)
    gpu_elapsed = time.time() - t0

    cpu_model = skKMeans(n_init=1, n_clusters=10, init='random')
    t0 = time.time()
    cpu_model.fit(X)
    cpu_elapsed = time.time() - t0

    # Informational only: GPU time first, then CPU time.
    print(gpu_elapsed)
    print(cpu_elapsed)
# Top-level plotting script: draws three side-by-side k-means comparisons —
# a hand-written k-means, scikit-learn's, and the ground-truth labels.
# NOTE(review): `mdl`, `plt`, `fig`, `data`, `data_all`, `fnames`,
# `plot_data_kmeans`, and `get_centers` are defined outside this view —
# presumably earlier in the same script; confirm before editing.
centers, labels_my_cluster = mdl.fit(data, k=3, max_iter=1000)
plt.tight_layout()
## comparison of k-means cluster and its performance
## my k-mean cluster
ax = fig.add_subplot(2, 3, 4)
plot_data_kmeans(data, ax=ax, labels=labels_my_cluster, centers=centers, fnames=fnames)
ax.set_title('My K-mean clustering')
## sklearn k-mean cluster
labels_sk = skKMeans(n_clusters=3).fit(data).labels_
# Centroids recomputed from the labels so both plots use the same helper.
centers_sk = get_centers(data, labels_sk)
ax = fig.add_subplot(2, 3, 5)
plot_data_kmeans(data, ax=ax, labels=labels_sk, centers=centers_sk, fnames=fnames)
ax.set_title('sklearn\'s K-mean clustering')
## truth
ax = fig.add_subplot(2, 3, 6)
labels_true = data_all.target
centers_true = get_centers(data, labels_true)
# NOTE(review): this call is truncated here — its remaining arguments
# continue beyond the visible chunk.
plot_data_kmeans(data, ax=ax,
def k_means():
    """Build a scikit-learn KMeans model using the module-level ``clusters`` count."""
    # NOTE(review): the message says "Classifier" although KMeans is a
    # clustering algorithm — kept verbatim to preserve runtime output.
    print("[INFO] - KMeans - KMeans Classifier")
    return skKMeans(n_clusters=clusters)