def perform_spectral_clustering(self, no_clusters):
    """
    Run scikit-learn's SpectralClustering on ``self.distance_matrix``.

    A summary of the fit (estimator parameters, labels, cluster count and
    per-label counts) is stored in ``self.spectral_results`` and printed
    with ``print_dict``. The affinity matrix built by the estimator is kept
    in ``self.affinity_matrix``.

    :param no_clusters: the number of clusters requested from the estimator.
    """
    model = SpectralClustering(n_clusters=no_clusters)
    model.fit(self.distance_matrix)

    assigned = model.labels_
    summary = {}
    summary["parameters"] = model.get_params()
    summary["labels"] = assigned
    # Highest label id + 1, i.e. the cluster count when ids are contiguous.
    summary["n_clusters"] = np.unique(assigned).max() + 1
    summary["clusters"] = label_cnt_dict(assigned)

    self.spectral_results = summary
    print_dict(self.spectral_results)

    # Gaussian kernel affinity matrix computed by the estimator.
    self.affinity_matrix = model.affinity_matrix_
def do_spectral_clustering(target_csv, num_cluster=24):
    """
    Cluster the rows of ``<CONFIG.CSV_PATH>/<target_csv>.csv`` with Spectral
    Clustering and write the labels to ``clustered_spectral_<target_csv>.csv``.

    A random half of the rows is clustered first, the remaining half is
    clustered afterwards, and both label sets are merged, sorted by index,
    scored (Calinski-Harabasz, silhouette, Davies-Bouldin) and saved.

    NOTE(review): SpectralClustering offers no ``predict``; ``fit_predict``
    re-fits the estimator on the second half. Label ids of the two halves
    therefore come from independent fits and need not denote the same
    clusters. Left unchanged here; confirm whether this is intended.

    :param target_csv: base name (without ``.csv``) of the input file.
    :param num_cluster: number of clusters to request (default 24).
    """
    source_path = os.path.join(CONFIG.CSV_PATH, target_csv + '.csv')
    df_data = pd.read_csv(source_path, index_col=0, header=0,
                          encoding='utf-8-sig')
    df_data.index.name = 'short_code'
    print(df_data.iloc[:100])
    print(df_data.shape)

    start_time = time.time()
    ds_data = df_data.sample(frac=0.5)
    # Rows that were not sampled. A set is not a valid ``.loc`` indexer in
    # pandas >= 2.0, so the sampled labels are dropped instead.
    ns_data = df_data.drop(ds_data.index)

    clustering = SpectralClustering(n_clusters=num_cluster, random_state=42,
                                    assign_labels='discretize')
    clustering.fit(ds_data)
    print("time elapsed for clustering: " + str(time.time() - start_time))
    print(clustering.get_params())
    print(clustering.labels_)
    count_percentage(clustering.labels_)

    start_time = time.time()
    result_ds = pd.DataFrame(data=clustering.labels_, index=ds_data.index,
                             columns=['cluster'])
    ns_label = clustering.fit_predict(ns_data)
    result_ns = pd.DataFrame(data=ns_label, index=ns_data.index,
                             columns=['cluster'])
    result_df = pd.concat([result_ds, result_ns])
    result_df.sort_index(inplace=True)
    print("time elapsed for prediction: " + str(time.time() - start_time))

    start_time = time.time()
    # The metrics pair samples and labels by position, not by index, so the
    # features are put in the same (sorted) order as the labels.
    scored_data = df_data.loc[result_df.index]
    scored_labels = result_df['cluster'].values
    print("calinski_harabasz_score: ",
          calinski_harabasz_score(scored_data, scored_labels))
    print("silhouette_score: ",
          silhouette_score(scored_data, scored_labels))
    print("davies_bouldin_score: ",
          davies_bouldin_score(scored_data, scored_labels))
    print("time elapsed for scoring: " + str(time.time() - start_time))

    target_path = os.path.join(CONFIG.CSV_PATH,
                               'clustered_spectral_' + target_csv + '.csv')
    result_df.to_csv(target_path, encoding='utf-8-sig')
def cluster(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    Runs Spectral Clustering on the given samples and, unless plotting is
    disabled, plots the clusters and the classes against the clusters.

    :param x: the samples to cluster.
    :param y: the class labels, used only for the classes plot.
    :return: the cluster label of every sample.
    """
    logger.log('Creating model...')
    model = SpectralClustering(affinity='nearest_neighbors', n_clusters=5,
                               n_neighbors=250, random_state=0, n_jobs=-1)
    params = model.get_params()
    logger.log('Applying Spectral Clustering with params: \n{}'.format(params))

    logger.log('Fitting...')
    fit_started = time.perf_counter()
    model.fit(x)
    fit_finished = time.perf_counter()
    elapsed = fit_finished - fit_started
    logger.log('Model has been fit in {:.3} seconds.'.format(elapsed))

    if PLOTTING_MODE == 'none':
        return model.labels_

    cluster_count = params['n_clusters']
    neighbor_count = params['n_neighbors']

    # Scatter of the samples coloured by their assigned cluster.
    plotter.subfolder = 'graphs/Spectral Clustering/clusters'
    plotter.filename = 'after_LLE_c={}-n={}'.format(cluster_count,
                                                    neighbor_count)
    plotter.xlabel = 'first feature'
    plotter.ylabel = 'second feature'
    title_template = 'Spectral Clustering after LLE\nClusters: {}, Neighbors: {}'
    plotter.title = title_template.format(cluster_count, neighbor_count)
    plotter.scatter(x, model.labels_, clustering=True)

    # Scatter of the true classes set against the clusters found.
    plotter.subfolder = 'graphs/Spectral Clustering/classes'
    plotter.scatter(x, y, clusters=model.labels_,
                    class_labels=helpers.datasets.get_gene_name)

    return model.labels_
def _spectral_clustering(table, input_cols, prediction_col='prediction',
                         n_clusters=8, eigen_solver=None, random_state=None,
                         n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10,
                         eigen_tol=0.0, assign_labels='kmeans', degree=3,
                         coef0=1):
    """
    Fit SpectralClustering on ``table[input_cols]`` and build a report.

    :param table: input DataFrame.
    :param input_cols: names of the columns used as features.
    :param prediction_col: name of the column that receives the labels.
    :param eigen_solver: solver name; the string ``'None'`` is treated as
        ``None``.
    :return: dict with ``'out_table'`` (a copy of ``table`` with the labels
        in ``prediction_col``) and ``'model'`` (the model dict holding the
        fitted estimator, ``input_cols`` and the rendered report).

    The remaining keyword arguments are forwarded unchanged to
    ``SpectralClustering``.
    """
    inputarr = table[input_cols]
    _eigen_solver = None if eigen_solver == 'None' else eigen_solver

    sc = SpectralClustering(n_clusters=n_clusters, eigen_solver=_eigen_solver,
                            random_state=random_state, n_init=n_init,
                            gamma=gamma, affinity=affinity,
                            n_neighbors=n_neighbors, eigen_tol=eigen_tol,
                            assign_labels=assign_labels, degree=degree,
                            coef0=coef0)
    sc.fit(inputarr)

    # Estimator parameter name -> label shown in the report.
    label_name = {
        'n_clusters': 'N Clusters',
        'eigen_solver': 'Eigen Solver',
        'random_state': 'Seed',
        'n_init': 'N Init',
        'gamma': 'Gamma',
        'affinity': 'Affinity',
        'n_neighbors': 'N Neighbors',
        'eigen_tol': 'Eigen Tolerance',
        'assign_labels': 'Assign Labels',
        'degree': 'Degree',
        'coef0': 'Zero Coefficient'
    }
    get_param = sc.get_params()
    # DataFrame.from_items no longer exists in pandas >= 1.0; the plain
    # constructor with explicit ``columns`` gives the same two-column table.
    param_table = pd.DataFrame(
        {
            'Parameter': list(label_name.values()),
            'Value': [get_param[name] for name in label_name],
        },
        columns=['Parameter', 'Value'])

    labels = sc.labels_
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)

    # The PCA projection is only produced for more than one input column.
    has_pca = len(input_cols) > 1
    if has_pca:
        pca2_model = PCA(n_components=2).fit(inputarr)
        pca2 = pca2_model.transform(inputarr)

    # Tables with more than 100 rows pass 100 to the samples plot, smaller
    # ones pass None.
    sample_limit = 100 if len(table.index) > 100 else None
    fig_samples = _spectral_clustering_samples_plot(
        labels, table, input_cols, sample_limit, n_clusters, colors)

    rb = BrtcReprBuilder()
    if has_pca:
        fig_pca = _spectral_clustering_pca_plot(labels, pca2_model, pca2,
                                                n_clusters, colors)
        rb.addMD(strip_margin("""
        | ## Spectral Clustering Result
        | - Samples
        | {fig_samples}
        | {fig_pca}
        | ### Parameters
        | {params}
        """.format(fig_samples=fig_samples, fig_pca=fig_pca,
                   params=pandasDF2MD(param_table))))
    else:
        # Heading corrected: it used to read "Mean Shift Result".
        rb.addMD(strip_margin("""
        | ## Spectral Clustering Result
        | - Samples
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_samples=fig_samples,
                   params=pandasDF2MD(param_table))))

    model = _model_dict('spectral_clustering')
    model['model'] = sc
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
def perform_spectral_clustering(self, no_clusters, affinity='rbf', gamma=1.0,
                                n_neighbors=10, pass_labels=False, n_init=10,
                                force_manual=False):
    """
    Spectral clustering of the instance data, via scikit-learn or by hand.

    With ``force_manual`` the affinity matrix, the normalised graph
    Laplacian, its eigen-decomposition and the final K-Means step are done
    explicitly (the affinity matrix is kept in ``self.affinity_matrix``).
    Otherwise ``SpectralClustering`` is fitted on ``self.data``. In both
    cases the outcome is stored in ``self.spectral_results`` and printed
    with ``print_dict``.

    :param no_clusters: number of clusters / eigenvectors to use.
    :param affinity: ``'rbf'`` or ``'nearest_neighbors'`` in the manual path;
        passed through to the estimator otherwise.
    :param gamma: kernel coefficient of the rbf affinity.
    :param n_neighbors: neighbours used by the nearest-neighbours affinity.
    :param pass_labels: if truthy, ``self.class_labels`` is handed to the
        visualisation helpers (manual path) or to ``fit`` as ``y``.
    :param n_init: number of K-Means initialisations.
    :param force_manual: choose the hand-written pipeline.
    :raises ValueError: manual path with an unsupported ``affinity``.
    """
    if force_manual:
        if not hasattr(self, "distance_matrix"):
            self.comp_distance_matrix()

        if affinity == 'rbf':
            self.affinity_matrix = np.exp(-gamma * self.distance_matrix**2)
        elif affinity == 'nearest_neighbors':
            connectivity = kneighbors_graph(
                self.data, n_neighbors=n_neighbors,
                include_self=True).toarray()
            # A k-NN graph is directed; eigh below assumes a symmetric
            # matrix, so symmetrise it as the estimator does.
            self.affinity_matrix = 0.5 * (connectivity + connectivity.T)
        else:
            # ValueError is still an Exception, so existing handlers work.
            raise ValueError("Affinity is NOT recognised as VALID ...")
        print("Computed Affinity Matrix ...")

        # Laplacian matrix of the graph, negated so that the eigenvectors of
        # interest end up last in eigh's ascending order.
        lap, dd = laplacian(self.affinity_matrix, normed=True,
                            return_diag=True)
        lap *= -1
        print("Computed Graph Laplacian ...")

        lambdas, diffusion_map = np.linalg.eigh(lap)
        print("Performed Eigen-decomposition ...")

        # NOTE(review): recent scikit-learn releases divide by ``dd`` at this
        # step; confirm that multiplying is intended.
        embedding = diffusion_map.T[(self.n_samples - no_clusters):] * dd

        # Deterministic vector flip: the entry of largest magnitude in each
        # eigenvector is made positive.
        sign = np.sign(embedding[range(embedding.shape[0]),
                                 np.argmax(np.abs(embedding), axis=1)])
        embedding = embedding.T * sign

        if no_clusters == 2:
            visualise_2D(embedding.T[0], embedding.T[1],
                         self.class_labels if pass_labels else None)
        elif no_clusters == 3:
            visualise_3D(embedding.T[0], embedding.T[1], embedding.T[2],
                         self.class_labels if pass_labels else None)

        print("Performing K-Means clustering in eigen-space")
        # ``n_jobs`` no longer exists in scikit-learn >= 1.0; ``n_init`` is
        # now honoured instead of only being reported.
        kmeans_clusterer = KMeans(n_clusters=no_clusters, n_init=n_init)
        kmeans_clusterer.fit(embedding)

        spectral_params = {
            "affinity": affinity,
            "gamma": gamma,
            "n_neighbors": n_neighbors,
            "n_init": n_init
        }
        self.spectral_results = {
            "parameters": spectral_params,
            "labels": kmeans_clusterer.labels_,
            "n_clusters": np.unique(kmeans_clusterer.labels_).max() + 1,
            "clusters": label_cnt_dict(kmeans_clusterer.labels_)
        }
    else:
        spectral_clusterer = SpectralClustering(n_clusters=no_clusters,
                                                gamma=gamma,
                                                affinity=affinity,
                                                n_neighbors=n_neighbors,
                                                n_init=n_init)
        spectral_clusterer.fit(
            self.data, y=(self.class_labels if pass_labels else None))

        self.spectral_results = {
            "parameters": spectral_clusterer.get_params(),
            "labels": spectral_clusterer.labels_,
            "n_clusters": np.unique(spectral_clusterer.labels_).max() + 1,
            "clusters": label_cnt_dict(spectral_clusterer.labels_)
        }

    print_dict(self.spectral_results)