import numpy as np
import scipy.sparse as smat


def spectral(Y, X, dtype=np.float32):
    # Note: X is accepted for interface compatibility but unused below.
    # The original used SciPy's deprecated NumPy aliases (sp.sqrt, sp.ceil,
    # ...); plain NumPy is used here instead.
    from sklearn.cluster import SpectralCoclustering

    def scale_normalize(X):
        """Adapted from https://github.com/scikit-learn/scikit-learn/blob/b194674c4/sklearn/cluster/_bicluster.py#L108"""
        row_diag = np.asarray(np.sqrt(X.sum(axis=1))).squeeze()
        col_diag = np.asarray(np.sqrt(X.sum(axis=0))).squeeze()
        row_diag[row_diag == 0] = 1.0
        col_diag[col_diag == 0] = 1.0
        row_diag = 1.0 / row_diag
        col_diag = 1.0 / col_diag
        if smat.issparse(X):
            n_rows, n_cols = X.shape
            r = smat.dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
            c = smat.dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
            an = r * X * c
        else:
            an = row_diag[:, np.newaxis] * X * col_diag
        return an, row_diag, col_diag

    coclustering = SpectralCoclustering(n_clusters=16384, random_state=1)
    normalized_data, row_diag, col_diag = scale_normalize(Y.T)
    n_sv = 1 + int(np.ceil(np.log2(coclustering.n_clusters)))
    # _svd is a private scikit-learn helper: it returns left/right singular
    # vectors with the first n_discard components dropped.
    u, v = coclustering._svd(normalized_data, n_sv, n_discard=1)
    label_embedding = smat.csr_matrix(u, dtype=dtype)
    return label_embedding
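# A minimal usage sketch for spectral() above, assuming Y is a sparse
# instance-by-label indicator matrix; the sizes and density here are made up
# for illustration.
import numpy as np
import scipy.sparse as smat

rng = np.random.default_rng(0)
Y = smat.csr_matrix((rng.random((64, 32)) < 0.1).astype(np.float32))
emb = spectral(Y, X=None)  # X is unused by spectral()
print(emb.shape)  # (n_labels, n_singular_vectors_kept)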
def test_spectral_coclustering():
    # Test Dhillon's Spectral CoClustering on a simple problem.
    param_grid = {
        "svd_method": ["randomized", "arpack"],
        "n_svd_vecs": [None, 20],
        "mini_batch": [False, True],
        "init": ["k-means++"],
        "n_init": [10],
    }
    random_state = 0
    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5, random_state=random_state)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3, random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert model.rows_.shape == (3, 30)
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
def cluster(y, X, n_clusters):
    # onp is presumably plain NumPy (e.g. `import numpy as onp`), with `np`
    # likely bound to jax.numpy elsewhere in this codebase.
    if n_clusters != 1:
        y_ = onp.array(y, dtype='float64')
        # mask = onp.clip((onp.diff(y, n=1, axis=0) == 0).argmin(axis=0),
        #                 a_max=2 * (y.shape[0] // 3), a_min=0)
        # for i in range(mask.size):
        #     y_[range(mask[i]), i] = np.nan
        corr = pd.DataFrame(y_).corr(method='kendall')
        model = SpectralCoclustering(n_clusters=n_clusters)
        model.fit(corr)
        clusters = [model.get_indices(i)[0] for i in range(n_clusters)]

        def fn_by_cluster(x, fn, weights=None):
            if weights is not None:
                return np.concatenate(
                    [fn(x[..., rng], axis=-1, weights=weights[i])[..., np.newaxis]
                     for i, rng in enumerate(clusters)], axis=-1)
            else:
                return np.concatenate(
                    [fn(x[..., rng], axis=-1)[..., np.newaxis]
                     for i, rng in enumerate(clusters)], axis=-1)

        fn_market_share = lambda x: [np.sum(x[..., rng], axis=0) / np.sum(x[..., rng])
                                     for rng in clusters]

        y_ = fn_by_cluster(y, np.sum)
        # Compute within-group market shares
        y_weights = fn_market_share(y)
        X_ = fn_by_cluster(X, np.average, y_weights)
        return y_, X_, clusters
    else:
        return (np.sum(y, axis=-1)[..., np.newaxis],
                np.mean(X, axis=-1)[..., np.newaxis], 1)
def cluster(self, cluster_name):
    self.name = cluster_name.strip()
    print('cluster_name ' + self.name)
    if self.name == 'k-means':
        print('cluster_name: ' + self.name)
        self.clustering = KMeans(n_clusters=self.k, init='k-means++',
                                 max_iter=500, n_init=1)
        print("Clustering sparse data with %s" % self.clustering)
        t0 = time()
        self.clustering.fit(self.X)
        print("done in %0.3fs" % (time() - t0))
        print()
    elif cluster_name == 'agglo':
        self.clustering = AgglomerativeClustering(
            n_clusters=self.k, affinity='euclidean', memory=None,
            connectivity=None, compute_full_tree='auto', linkage='ward',
            distance_threshold=None)
        print("Clustering sparse data with %s" % self.clustering)
        t0 = time()
        # AgglomerativeClustering needs a dense matrix
        self.X = self.X.toarray()
        self.clustering.fit(self.X)
        print("done in %0.3fs" % (time() - t0))
        print()
    elif self.name == 'spectral_cocluster':
        self.clustering = SpectralCoclustering(n_clusters=self.k,
                                               svd_method='arpack',
                                               random_state=0)
        print("Clustering sparse data with %s" % self.clustering)
        t0 = time()
        self.clustering.fit(self.X)
        print("done in %0.3fs" % (time() - t0))
        print()
class Spectral(object):
    def __init__(self, dataset):
        """initialize spectral class

        Arguments:
            dataset {str} -- name of the dataset to fetch
        """
        data_map = {'classic3': 1, 'cstr': 3, 'mnist': 2}
        self.dataset = dataset
        print("Fetching ", dataset)
        self.data, self.labels = get_data_set(data_map[dataset])
        if (~self.data.any(axis=0)).any():
            print("Found empty features. deleting...")
            self.data = np.delete(self.data,
                                  np.where(~self.data.any(axis=0))[0], axis=1)

    def view_dataset(self, title, data, markersize=0.001):
        """plot data matrix

        Arguments:
            title {str} -- title of plot
            data {np.array} -- dataset to plot

        Keyword Arguments:
            markersize {float} -- size of datapoints (default: {0.001})
        """
        plt.spy(data, markersize=markersize)
        plt.title(title)
        plt.show()

    def shuffle_data(self):
        """shuffles self.data"""
        print("Shuffling")
        self.data, self.labels = shuffle(self.data, self.labels)
        self.view_dataset(data=self.data, title='shuffled data')

    def form_biclusters(self):
        """generates spectral bi-clusters from self.data"""
        n_clusters = len(np.unique(self.labels))
        print("Generating {} clusters".format(n_clusters))
        # n_jobs was deprecated in scikit-learn 0.23 and removed in 0.25;
        # drop it on newer versions.
        self.bicluster = SpectralCoclustering(n_clusters=n_clusters, n_jobs=-1)
        self.bicluster.fit(self.data)

    def get_accuracy(self):
        """calculates NMI between self.bicluster rows and data labels"""
        nmi = normalized_mutual_info_score(self.bicluster.row_labels_,
                                           self.labels)
        print("Accuracy is ", nmi)

    def show_clusters(self):
        """sorts data according to bicluster row and col labels and plots"""
        fit_data = self.data[np.argsort(self.bicluster.row_labels_)]
        fit_data = fit_data[:, np.argsort(self.bicluster.column_labels_)]
        self.view_dataset(data=fit_data, title='co-clusters')
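# A hypothetical driver for the Spectral class above, assuming get_data_set()
# and the numpy/matplotlib/sklearn imports the class relies on are in scope.
sp_model = Spectral('classic3')
sp_model.shuffle_data()
sp_model.form_biclusters()
sp_model.get_accuracy()
sp_model.show_clusters()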
def spect(input_data, n):
    # Assumes module-level `labels`, `acc`, and `tsnePlot` defined elsewhere.
    spec_instance = SpectralCoclustering(n_clusters=n)
    spec_instance.fit(input_data)
    pred = spec_instance.row_labels_
    print(pred)
    print("ACC: " + str(accuracy_score(pred, labels)))
    acc["SPECT" + str(n)] = str(accuracy_score(pred, labels))
    tsnePlot(pred, n, input_data, 'SPECT')
def compute_coclustering(
    fit_data,
    num_clusters=1,
    tol_bicluster=0.005,  # sparsity otherwise annoyingly causes underflows w/ sklearn
):
    if num_clusters == 1:
        num_clusters = min(fit_data.shape[0], 5)
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(fit_data + tol_bicluster)
    ordered_rows = np.argsort(model.row_labels_)
    ordered_cols = np.argsort(model.column_labels_)
    return (ordered_rows, ordered_cols,
            model.row_labels_[ordered_rows],
            model.column_labels_[ordered_cols])
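# Minimal sketch of using compute_coclustering() to reorder a nonnegative
# matrix so its biclusters form contiguous blocks; the data here is synthetic.
import numpy as np
from sklearn.datasets import make_biclusters

data, _, _ = make_biclusters((40, 30), n_clusters=4, noise=0.1, random_state=0)
data -= data.min()  # keep it nonnegative, as the tol_bicluster shift assumes
rows, cols, row_labels, col_labels = compute_coclustering(data, num_clusters=4)
reordered = data[rows][:, cols]  # block structure is now visible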
def visualizeCorr(sgp, args):
    # Assumes module-level `data` and `idMap` defined elsewhere.
    sgp.cpu()
    if args.file.split('/')[-2] == 'simulation':
        final_corr = data['corr']
        allX = torch.tensor(data['data']).type(torch.float)
        allIid = data['iid'].reshape(-1)
        plt.figure()
        sns.heatmap(final_corr, cmap="YlGnBu", square=True, robust=True,
                    xticklabels=False, yticklabels=False)
        corr = sgp.deepkernel(allX, allIid).detach().cpu().numpy()
        plt.figure()
        sns.heatmap(corr, cmap='YlGnBu', square=True, robust=True,
                    xticklabels=False, yticklabels=False)
        plt.show()
    else:
        from sklearn.cluster import SpectralCoclustering

        indv_corr = sgp.deepkernel.indv_kernel(
            torch.arange(len(idMap))).detach().cpu().numpy()
        num_c = args.number_cluster
        model = SpectralCoclustering(n_clusters=num_c, random_state=0)
        model.fit(indv_corr)
        # (fit_data and rows below are computed but unused by the plot.)
        fit_data = indv_corr[np.argsort(model.row_labels_)]
        fit_data = fit_data[:, np.argsort(model.row_labels_)]
        rows = np.random.permutation(np.arange(len(fit_data)))
        rows = rows[:3300]
        rows = np.sort(rows)
        clusterRes = model.row_labels_
        cl = np.argsort(clusterRes)
        ax = sns.heatmap(indv_corr[cl][:, cl], cmap='YlGnBu', square=True,
                         robust=True, xticklabels=False, yticklabels=False)
        plt.show()
class Cluster:
    def __init__(self, n_clusters, feature_vectors):
        self.n_clusters = n_clusters
        self.feature_vectors = feature_vectors

    def kmeans(self):
        self.model = KMeans(n_clusters=self.n_clusters)

    def agglomerative(self, linkage, affinity):
        self.model = AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage=linkage, affinity=affinity)

    def birch(self):
        # acc is 0.87
        self.model = Birch(n_clusters=self.n_clusters)

    def spectral(self, affinity, n_neighbors=None):
        self.model = SpectralClustering(
            n_clusters=self.n_clusters, affinity=affinity,
            n_neighbors=n_neighbors)

    def spectral_biclustering(self):
        self.model = SpectralBiclustering(n_clusters=self.n_clusters)

    def spectral_coclustering(self):
        self.model = SpectralCoclustering(n_clusters=self.n_clusters)

    def fit_model(self):
        # fit model and predict
        self.model.fit(self.feature_vectors)
        try:
            self.predicted_labels = self.model.labels_
        except AttributeError:
            # SpectralBiclustering and SpectralCoclustering expose
            # row_labels_ instead of labels_
            print(self.model.row_labels_.shape)
            self.predicted_labels = self.model.row_labels_
        except Exception as e:
            print(e)

    def save_result(self, file_path):
        np.savetxt('{}'.format(file_path),
                   self.predicted_labels.astype(int), fmt='%i')

    def goodness(self, true_labels, base_precision, improved_precision,
                 verbose=False):
        self.fit_model()
        # evaluate performance
        normalized_mutual_info = normalized_mutual_info_score(
            true_labels, self.predicted_labels)
        points = (normalized_mutual_info - base_precision) / improved_precision + 1
        if verbose:
            print('current project can get {:d} points'.format(int(points)))
        return normalized_mutual_info
def bicluster_correlation_matrix(X, n_clusters=10, figsize=None):
    """
    Group similar variables together by running the spectral coclustering
    algorithm on a dataset's correlation matrix. See https://bit.ly/2QgXZB2
    for more details. Spectral coclustering finds groups of similar
    (row, column) subsets where each column can only belong to a single
    bicluster. This is different from "checkerboard" biclustering.

    Parameters
    ------------
    X: {pd.DataFrame} numeric feature data. Shape {observations} x {features}
    n_clusters: {int} number of biclusters to construct
    figsize: {2-tuple of int} pyplot Figure size. Default [10, 10].

    Returns
    ------------
    coclust: {fitted sklearn.cluster.SpectralCoclustering object}
    """
    # -- get estimate of correlation matrix using median-imputed version of data,
    # -- and then downsample to at most 100k datapoints for speed.
    num_df = X.iloc[np.random.choice(range(X.shape[0]),
                                     size=min(100000, X.shape[0]),
                                     replace=False)]
    cor_mat = num_df.fillna(num_df.median()).corr()

    # -- run coclustering.
    coclust = SpectralCoclustering(n_clusters=n_clusters, random_state=666)
    coclust.fit(cor_mat)

    # -- re-order correlation matrix by cluster indices.
    biclust_dat = cor_mat.iloc[np.argsort(coclust.row_labels_)]
    biclust_dat = biclust_dat.iloc[:, np.argsort(coclust.column_labels_)]

    # -- display biclustering pattern. (Keep the Axes handle; matshow returns
    # -- an AxesImage, which has no set_title.)
    fig = plt.figure(figsize=figsize if figsize else [10, 10])
    ax = fig.add_subplot(111)
    ax.matshow(biclust_dat, cmap='cool')
    ax.set_title(f'Correlation matrix post-biclustering: {n_clusters} clusters')
    ax.set_yticks(range(biclust_dat.shape[0]))
    ax.set_yticklabels(biclust_dat.index.tolist())
    plt.show()

    return coclust
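# Hypothetical usage of bicluster_correlation_matrix() on random data;
# the column names and sizes are made up, and the numpy/pandas/matplotlib/
# sklearn imports the function relies on are assumed to be in scope.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(500, 12)),
                  columns=[f'var_{i}' for i in range(12)])
coclust = bicluster_correlation_matrix(df, n_clusters=3)
print(coclust.row_labels_)  # one cluster id per variable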
def rearrange_confusion_matrix(cm, n_clusters):
    from sklearn.cluster import SpectralCoclustering

    clst = SpectralCoclustering(n_clusters=n_clusters).fit(cm)
    idx = []
    for c in range(n_clusters):
        idx.append(clst.get_indices(c)[0])
    idx = np.concatenate(idx)
    cm_clustered = np.zeros(cm.shape, dtype=int)
    for i, idxi in enumerate(idx):
        for j, idxj in enumerate(idx):
            cm_clustered[i, j] = cm[idxi, idxj]
    return cm_clustered, idx
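# Sketch of rearranging a confusion matrix so commonly-confused classes sit
# in adjacent blocks; labels below are synthetic, for illustration only.
import numpy as np
from sklearn.metrics import confusion_matrix

rng = np.random.default_rng(0)
y_true = rng.integers(0, 6, size=200)
y_pred = np.where(rng.random(200) < 0.7, y_true, rng.integers(0, 6, size=200))
cm = confusion_matrix(y_true, y_pred)
cm_clustered, idx = rearrange_confusion_matrix(cm, n_clusters=2)
print(idx)  # class order after block-clustering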
def cocluster(np_sums, matrix_diags, vectorizer):
    '''
    Perform the coclustering
    '''
    x = np.array(np_sums)
    # print(x)
    n_clusters = 20
    clustering = SpectralCoclustering(n_clusters=n_clusters,
                                      random_state=0).fit(x)
    for i in range(n_clusters):
        row_nums, col_nums = clustering.get_indices(i)
        row_words = [matrix_diags[num] for num in row_nums]
        col_words = [vectorizer.get_feature_names()[num] for num in col_nums]
        print("Cluster: ", i)
        print("===========")
        print("Diagnoses: ", row_words)
        print()
        print("n-grams: ", col_words)
        print()
def plot_matrix(all_feature_names_arg, mat, filename, force_no_cocluster=False):
    print(datetime.datetime.now(), 'plot_matrix')
    print(' mat.shape=', mat.shape)
    plt.figure(figsize=(10, 4))

    # set the x-axis to only include the biggest words
    if not args.no_biggest_words:
        l2_norms = np.linalg.norm(mat, axis=0, ord=args.norm)
        indices = l2_norms.argsort()[-args.num_words:]
        mat = mat[:, indices]
        all_feature_names = all_feature_names_arg
        words = [all_feature_names[i] for i in indices]
        plt.xticks(range(0, len(words)), words, rotation=-90)

    # cocluster the axes
    if not args.no_cocluster and not force_no_cocluster:
        clustering = SpectralCoclustering(n_clusters=6, random_state=1).fit(mat)
        col_indices = np.argsort(clustering.column_labels_)
        mat = mat[:, col_indices]
        try:
            words = [words[i] for i in col_indices]
            plt.xticks(range(0, len(words)), words, rotation=-90)
        except Exception:
            # `words` is undefined when --no_biggest_words is set
            pass

    # plot the figure
    plt.imshow(mat, aspect='auto', cmap='RdBu',
               norm=colors.SymLogNorm(linthresh=0.03, linscale=0.03,
                                      vmin=-1e6, vmax=1e6))
    plt.yticks(ticks=[0, 1, 2, 3, 4, 5, 6, 7, 8], labels=model.classes_)
    plt.ylim(-0.5, 8.5)
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(filename)
def spectral_clustering_experiment_helper(args):
    dataset, lr, seed, ModelClass, n_clusters = args
    np.random.seed(seed)
    choice_sets, choices, _ = dataset.load_pytorch()
    spec_clusters = SpectralCoclustering(
        n_clusters=n_clusters,
        random_state=seed).fit(choice_sets.squeeze().numpy()).row_labels_
    rand_clusters = np.random.permutation(spec_clusters)

    spec_results = []
    rand_results = []
    n_items = choice_sets.size(1)
    for cluster in sorted(set(spec_clusters)):
        for clusters, results in zip([spec_clusters, rand_clusters],
                                     [spec_results, rand_results]):
            cluster_idx = clusters == cluster
            cluster_choice_sets = choice_sets[cluster_idx]
            cluster_choices = choices[cluster_idx]
            w = torch.ones(len(cluster_choices))

            model = ModelClass(n_items)
            loss = fit(model, (cluster_choice_sets, cluster_choices, w),
                       epochs=EPOCHS, learning_rate=lr, l2_lambda=L2_LAMBDA,
                       show_progress=False)
            results.append((len(cluster_choices), loss, model.state_dict(),
                            model.num_params))
    return args, spec_results, rand_results
{ "n_components": 3, "n_best": 4 }, ], ) def test_errors(args): data = np.arange(25).reshape((5, 5)) model = SpectralBiclustering(**args) with pytest.raises(ValueError): model.fit(data) def test_wrong_shape(): model = SpectralBiclustering() data = np.arange(27).reshape((3, 3, 3)) with pytest.raises(ValueError): model.fit(data) @pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering())) def test_n_features_in_(est): X, _, _ = make_biclusters((3, 3), 3, random_state=0) assert not hasattr(est, "n_features_in_") est.fit(X) assert est.n_features_in_ == 3
# (The call producing these keyword arguments was truncated; reconstructed
# following the scikit-learn spectral coclustering example, which uses
# shape=(300, 300) and n_clusters=5.)
data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5, shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(data.shape[0])
col_idx = rng.permutation(data.shape[1])
data = data[row_idx][:, col_idx]

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralCoclustering(n_clusters=5, random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.3f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

# plt.show()
def spectral_coclustering(tfidf_matrix, n_clusters=100):
    return SpectralCoclustering(n_clusters=n_clusters).fit(tfidf_matrix)
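# Sketch of feeding a TF-IDF matrix to spectral_coclustering(); the toy corpus
# is illustrative, and n_clusters must not exceed the number of documents.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["spectral methods for text", "text clustering with tfidf",
        "graphs and spectra", "tfidf weights terms by rarity"]
tfidf = TfidfVectorizer().fit_transform(docs)
model = spectral_coclustering(tfidf, n_clusters=2)
print(model.row_labels_)     # one cluster id per document
print(model.column_labels_)  # one cluster id per term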
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralCoclustering

whisky = pd.read_csv('whiskies.txt')
whisky['Region'] = pd.read_csv('regions.txt')
flavors = whisky.iloc[:, 2:14]
corr_flavors = pd.DataFrame.corr(flavors)
corr_whisky = pd.DataFrame.corr(flavors.transpose())

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)

whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
whisky = whisky.iloc[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)
correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
correlations = np.array(correlations)

# HoverTool (imported above) lets text pop up on a plot when the cursor
# hovers over it, and ColumnDataSource stores the arguments of what to plot
# in Bokeh; numpy is used throughout this section as well.
predict.tail()

# In[10]:

# concatenate labels to df as a new column
r = pd.concat([data, predict], axis=1)
print(r)
r.tail()

# In[11]:

import numpy as np
from sklearn.cluster import SpectralCoclustering

X = data.to_numpy()
clustering = SpectralCoclustering(n_clusters=5, random_state=0).fit(X)
clustering.row_labels_  # doctest: +SKIP
clustering.column_labels_  # doctest: +SKIP
clustering

# In[12]:

from sklearn.metrics import consensus_score
from matplotlib import pyplot as plt

# shuffle clusters
rng = np.random.RandomState(0)
row_idx = rng.permutation(X.shape[0])
col_idx = rng.permutation(X.shape[1])
def test_spectralcoclustering_parameter_validation(params, type_err, err_msg):
    """Check parameters validation in `SpectralCoclustering`"""
    data = np.arange(25).reshape((5, 5))

    model = SpectralCoclustering(**params)
    with pytest.raises(type_err, match=err_msg):
        model.fit(data)
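# The fragment above omits its parametrize decorator. A hypothetical,
# version-tolerant parametrization (placed directly above the test) could be:
#
# @pytest.mark.parametrize(
#     "params, type_err, err_msg",
#     [({"svd_method": "unknown"}, ValueError, "svd_method")],
# )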
# exclude 'comp.os.ms-windows.misc'
categories = [
    'alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale',
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
    'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
    'sci.space', 'soc.religion.christian', 'talk.politics.guns',
    'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
]
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time, v_measure_score(y_cocluster, y_true)))
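# NumberNormalizingVectorizer is not part of scikit-learn; the upstream
# scikit-learn biclustering example defines it roughly as below, reproduced
# here so the snippet is self-contained.
from sklearn.feature_extraction.text import TfidfVectorizer


def number_normalizer(tokens):
    """Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.
    """
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)


class NumberNormalizingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super().build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))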
# Prints out the grid shape of the genres
print(visGrid.shape)
print(len(Genre_ID_to_name.keys()))

# Code that illustrates the heat map of co-occurring genres of movies
annot_lookup = []
for i in range(len(nr_ids)):
    annot_lookup.append(Genre_ID_to_name[nr_ids[i]])
sns.heatmap(visGrid, xticklabels=annot_lookup, yticklabels=annot_lookup)
plt.title("Heat map of Co-occurring Movie Genres")
plt.show()

# Bi-clustering to show genres that occur together and genres that don't
model = SpectralCoclustering(n_clusters=5)
model.fit(visGrid)

fit_data = visGrid[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

annot_lookup_sorted = []
for i in np.argsort(model.row_labels_):
    annot_lookup_sorted.append(Genre_ID_to_name[nr_ids[i]])

sns.heatmap(fit_data, xticklabels=annot_lookup_sorted,
            yticklabels=annot_lookup_sorted, annot=False)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
for item in word[1:]:
    value = 100 * float(item)
    matrix[row_index][column_index] = value
    if value < min_list[column_index]:
        min_list[column_index] = value
    if value > max_list[column_index]:
        max_list[column_index] = value
    if value != 0:
        # ave_list is reused here to count non-zero entries per column
        ave_list[column_index] += 1
    column_index += 1

print(unsta_max)
print("row_num", row_num)

# Run spectral coclustering; each gene gets one of the 10 cluster labels
model = SpectralCoclustering(n_clusters=10, random_state=0)
model.fit(matrix)
for i in range(len(row_dict)):
    print(i, '.', row_list[i], ':', model.row_labels_[i])
print(model.column_labels_)
fit_data = matrix[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

# Random forest on the clinical variables
con_num = file9.readline().split().__len__() - 1
print("con_num:", con_num)
lines = file9.readlines()
sam_num = lines.__len__()
print("sam_num:", sam_num)
x_train = np.empty(shape=(sam_num, con_num), dtype=int)  # np.int is removed in modern NumPy
class DocumentClustering:
    def __init__(self, k=5):
        self.name = 'k-means'
        self.k = k
        self.X = None
        self.clustering = None
        self.vectorizer = None
        self.dataset_size = 0
        self.doc2vec_matrix = False

    def make_matrix(self, documents=None, n_components=-1, doc2vec_matrix=None):
        if isinstance(doc2vec_matrix, np.ndarray) == False:
            self.vectorizer = TfidfVectorizer()
            # self.vectorizer = CountVectorizer()
            self.X = self.vectorizer.fit_transform(documents)
            self.dataset_size = len(documents)
        else:
            self.X = doc2vec_matrix
            self.dataset_size = len(doc2vec_matrix)
            self.doc2vec_matrix = True

        if n_components != -1:
            # Note: get_feature_names() was renamed get_feature_names_out()
            # in scikit-learn 1.0 and removed in 1.2.
            if n_components > len(self.vectorizer.get_feature_names()):
                n_components = len(self.vectorizer.get_feature_names())
            print('n_components ' + str(n_components))
            # Vectorizer results are normalized, which makes KMeans behave as
            # spherical k-means for better results. Since LSA/SVD results are
            # not normalized, we have to redo the normalization.
            print("Performing dimensionality reduction using LSA")
            t0 = time()
            svd = TruncatedSVD(n_components)
            normalizer = Normalizer(copy=False)
            lsa = make_pipeline(svd, normalizer)
            self.X = lsa.fit_transform(self.X)
            print("done in %fs" % (time() - t0))
            explained_variance = svd.explained_variance_ratio_.sum()
            print("Explained variance of the SVD step: {}%".format(
                int(explained_variance * 100)))
            print()

    def cluster(self, cluster_name):
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)
        if self.name == 'k-means':
            print('cluster_name: ' + self.name)
            self.clustering = KMeans(n_clusters=self.k, init='k-means++',
                                     max_iter=500, n_init=1)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif cluster_name == 'agglo':
            self.clustering = AgglomerativeClustering(
                n_clusters=self.k, affinity='euclidean', memory=None,
                connectivity=None, compute_full_tree='auto', linkage='ward',
                distance_threshold=None)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            # AgglomerativeClustering needs a dense matrix
            if self.doc2vec_matrix == False:
                self.X = self.X.toarray()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,
                                                   svd_method='arpack',
                                                   random_state=0)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()

    def print_results(self):
        # print the clustering result
        print(self.name)
        if self.name == 'k-means':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}
            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                print(str(cluster_label) + " -- " + str(document_id))
            order_centroids = self.clustering.cluster_centers_.argsort()[:, ::-1]
            terms = self.vectorizer.get_feature_names()
            for i in range(self.k):
                print("Cluster %d:" % i, end='')
                for ind in order_centroids[i, :10]:
                    print(' %s' % terms[ind], end='')
                print()
        elif self.name == 'agglo':
            cluster_labels = self.clustering.labels_
            clustering_dict = self.clustering.__dict__
            clusters = {}
            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                # print(str(cluster_label) + " -- " + str(document_id))
            results = self.get_cluster_top_keywords(clusters)
            for _cluster in results:
                key_terms = results[_cluster]
                print("Cluster " + str(_cluster) + " : " +
                      str(len(clusters[_cluster])) + " documents")
                print(key_terms)
                print()
        elif self.name == 'spectral_cocluster':
            target_number = 10
            bicluster_ncuts = list(self.bicluster_ncut(i)
                                   for i in range(self.k))
            best_idx = np.argsort(bicluster_ncuts)[:target_number]
            feature_names = self.vectorizer.get_feature_names()
            print()
            print("Best biclusters:")
            print("----------------")
            for idx, cluster in enumerate(best_idx):
                n_rows, n_cols = self.clustering.get_shape(cluster)
                cluster_docs, cluster_words = self.clustering.get_indices(cluster)
                if not len(cluster_docs) or not len(cluster_words):
                    continue

                # categories
                counter = defaultdict(int)
                for i in cluster_docs:
                    counter[str(i)] += 1
                cat_string = ", ".join(
                    "{:.0f}% {}".format(float(c) / n_rows * 100, name)
                    for name, c in self.most_common(counter)[:3])

                # words
                out_of_cluster_docs = self.clustering.row_labels_ != cluster
                out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
                word_col = self.X[:, cluster_words]
                word_scores = np.array(
                    word_col[cluster_docs, :].sum(axis=0) -
                    word_col[out_of_cluster_docs, :].sum(axis=0))
                word_scores = word_scores.ravel()
                important_words = list(feature_names[cluster_words[i]]
                                       for i in word_scores.argsort()[:-11:-1])

                print("bicluster {} : {} documents, {} words".format(
                    idx, n_rows, n_cols))
                print("categories : {}".format(cat_string))
                print("words : {}\n".format(', '.join(important_words)))

    def bicluster_ncut(self, i):
        rows, cols = self.clustering.get_indices(i)
        if not (np.any(rows) and np.any(cols)):
            import sys
            return sys.float_info.max
        row_complement = np.nonzero(np.logical_not(self.clustering.rows_[i]))[0]
        col_complement = np.nonzero(np.logical_not(self.clustering.columns_[i]))[0]
        # Note: the following is identical to X[rows[:, np.newaxis],
        # cols].sum() but much faster in scipy <= 0.16
        weight = self.X[rows][:, cols].sum()
        cut = (self.X[row_complement][:, cols].sum() +
               self.X[rows][:, col_complement].sum())
        return cut / weight

    def most_common(self, d):
        """Items of a defaultdict(int) with the highest values."""
        return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

    def get_cluster_top_keywords(self, clusters, keywords_per_cluster=10):
        """Shows the top k words for each cluster

        Keyword Arguments:
            keywords_per_cluster {int} -- The k words to show for each cluster
                (default: {10})

        Returns:
            dict of lists -- {cluster_id: ['top', 'k', 'words', 'for', 'cluster']}
        """
        terms = self.vectorizer.get_feature_names()
        out = {}
        docs_for_cluster = {}
        # `clusters` maps cluster_id -> indices of the documents in that
        # cluster, e.g. len(clusters[6]) == 508
        for cluster in clusters:
            # To flatten/combine all documents into one
            docs_for_cluster[cluster] = np.array(
                [self.X[i] for i in clusters[cluster]])
            # Cluster vectors to feature words
            out[cluster] = np.array(terms)[np.flip(
                np.argsort(docs_for_cluster[cluster]), -1)]
            cluster_shape = out[cluster].shape
            out[cluster] = out[cluster].reshape(
                cluster_shape[0] * cluster_shape[1])[:keywords_per_cluster].tolist()
        return out

    def visualize(self):
        # The output is a one-dimensional array of N documents corresponding
        # to the clusters assigned to our N data points.
        if self.name == 'spectral_cocluster':
            if self.doc2vec_matrix == False:
                pca_t = PCA().fit_transform(self.X.toarray())
            else:
                pca_t = PCA().fit_transform(self.X)
            plt.scatter(pca_t[:, 0], pca_t[:, 1],
                        c=self.clustering.row_labels_, cmap='rainbow')
            plt.show()
        elif self.name == 'agglo':
            pca_t = PCA().fit_transform(self.X)
            plt.scatter(pca_t[:, 0], pca_t[:, 1],
                        c=self.clustering.labels_, cmap='rainbow')
            plt.show()
        elif self.name == 'k-means':
            if self.doc2vec_matrix == False:
                self.X = self.X.toarray()
            pca_t = PCA().fit_transform(self.X)
            plt.scatter(pca_t[:, 0], pca_t[:, 1],
                        c=self.clustering.labels_, cmap='rainbow')
            plt.show()
'''
print(corr_whisky)
plt.figure(figsize=(10, 10))
plt.pcolor(corr_whisky)
plt.axis("tight")
plt.colorbar()
plt.show()
'''

# Spectral co-clustering
from sklearn.cluster import SpectralCoclustering

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)

# model.rows_ is a boolean matrix: every row corresponds to a cluster,
# every column to a data point.
# Summing along the columns gives the number of observations in each cluster.
print(np.sum(model.rows_, axis=1))
# Summing along the rows shows how many clusters each observation belongs to
# (exactly one for coclustering).
print(np.sum(model.rows_, axis=0))
# row_labels_[i] is the cluster that observation i was assigned to.
print(model.row_labels_)

# Comparing the correlation tables
def type_consistent_cocluster(topic_word_dict0, ename2embed_bert,
                              n_cluster_min, print_cls=False, save_file=None):
    topic_word_dict = {}
    all_words = []
    for topic in topic_word_dict0:
        topic_word_dict[topic] = []
        for ename in topic_word_dict0[topic]:
            if ename in ename2embed_bert:
                topic_word_dict[topic].append(ename)
                all_words.append(ename)
    topics = list(topic_word_dict0.keys())
    # print("topics")
    # print(topics)

    all_children = [x for x in all_words]
    # all_words.extend([x for x in topics if x in ename2embed_bert])
    all_embed = [ename2embed_bert[x][0] for x in all_words]
    # print(all_children)

    all_words_and_their_parents = []
    for word in all_words:
        for topic in topic_word_dict:
            if word in topic_word_dict[topic]:
                word0 = (topic, word)
                break
        all_words_and_their_parents.append(word0)
    # print(all_words_and_their_parents)

    # AP
    clustering = AffinityPropagation().fit(all_embed)
    n_clusters = max(clustering.labels_) + 1
    clusters = {}
    col_vectors = np.zeros((len(topic_word_dict), n_clusters), dtype=float)
    for i in range(n_clusters):
        clusters[i] = [all_words_and_their_parents[x]
                       for x in range(len(clustering.labels_))
                       if clustering.labels_[x] == i]
        for word0 in clusters[i]:
            word0_col = int(word0[0])
            col_vectors[word0_col, i] = 1
    col_vectors = np.array(col_vectors)
    col_vectors += 0.1 * np.ones((len(topic_word_dict), n_clusters), dtype=int)

    for n_cluster in range(n_cluster_min, n_cluster_min + 10):
        model = SpectralCoclustering(n_clusters=n_cluster, random_state=0)
        model.fit(col_vectors)
        new_topic_word_dict = {}
        coverage_list = []
        for ind in range(n_cluster):
            # print(ind)
            small_matrix = col_vectors[[
                x for x in range(len(model.row_labels_))
                if model.row_labels_[x] == ind
            ]]
            small_matrix = small_matrix[:, [
                x for x in range(len(model.column_labels_))
                if model.column_labels_[x] == ind
            ]]
            coverage_list.append(np.sum(small_matrix) /
                                 np.sum(np.ones_like(small_matrix)))
        if max(coverage_list) >= 0.7:
            break

    fit_data = col_vectors[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    cluster_count = [sum(model.row_labels_ == x) for x in range(n_cluster)]
    # print("row cluster count: ", cluster_count)
    cluster_count = [sum(model.column_labels_ == x) for x in range(n_cluster)]
    # print("column cluster count: ", cluster_count)

    coverage_thre = min(max(coverage_list), 0.4)
    # print('coverage: ', coverage_list)
    for ind in range(n_cluster):
        if coverage_list[ind] < coverage_thre:
            # print("del cluster ", ind)
            continue
        for topic in topic_word_dict:
            if model.row_labels_[int(topic)] == ind:
                new_topic_word_dict[topic] = [x for x in topic_word_dict[topic]]
    return new_topic_word_dict