def create_model(data, n_clusters, method, random):
    """Fit a SpectralBiclustering model on ``data`` and return it.

    :param data: matrix to bicluster
    :param n_clusters: cluster count (or (rows, cols) tuple) for the model
    :param method: normalization method passed to SpectralBiclustering
    :param random: random seed forwarded as ``random_state``
    :return: the fitted SpectralBiclustering instance
    """
    biclusterer = SpectralBiclustering(
        n_clusters=n_clusters,
        method=method,
        random_state=random,
    )
    biclusterer.fit(data)
    return biclusterer
def test_fit_best_piecewise():
    """_fit_best_piecewise must keep the n_best most piecewise-constant rows."""
    model = SpectralBiclustering(random_state=0)
    vectors = np.array([
        [0, 0, 0, 1, 1, 1],   # piecewise-constant (2 pieces)
        [2, 2, 2, 3, 3, 3],   # piecewise-constant (2 pieces)
        [0, 1, 2, 3, 4, 5],   # linear ramp: worst fit
    ])
    selected = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
    # The first two rows fit a 2-piece step function exactly.
    assert_array_equal(selected, vectors[:2])
def test_project_and_cluster():
    """_project_and_cluster must label rows identically for dense and sparse input."""
    model = SpectralBiclustering(random_state=0)
    data = np.array([[1, 1, 1],
                     [1, 1, 1],
                     [3, 6, 3],
                     [3, 6, 3]])
    vectors = np.array([[1, 0],
                        [0, 1],
                        [0, 0]])
    for mat in (data, csr_matrix(data)):
        # BUG FIX: the loop variable was previously ignored and the dense
        # `data` was always passed, so the sparse branch was never tested.
        labels = model._project_and_cluster(mat, vectors, n_clusters=2)
        assert_array_equal(labels, [0, 0, 1, 1])
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset."""
    param_grid = {'method': ['scale', 'bistochastic', 'log'],
                  'svd_method': ['randomized', 'arpack'],
                  'n_svd_vecs': [None, 20],
                  'mini_batch': [False, True],
                  'init': ['k-means++'],
                  'n_init': [10],
                  'n_jobs': [1]}
    seed = 0
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=seed)
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralBiclustering(n_clusters=3, random_state=seed,
                                         **kwargs)
            if issparse(mat) and kwargs['method'] == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue
            model.fit(mat)
            # 3x3 checkerboard over a 30x30 matrix -> 9 biclusters,
            # each row/column belonging to exactly 3 of them.
            assert_equal(model.rows_.shape, (9, 30))
            assert_equal(model.columns_.shape, (9, 30))
            assert_array_equal(model.rows_.sum(axis=0), np.repeat(3, 30))
            assert_array_equal(model.columns_.sum(axis=0), np.repeat(3, 30))
            assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)
def run(self, data):
    """Bicluster ``data`` and return the per-column (cell) cluster labels.

    Rows are grouped into ``self.n_gene_classes`` clusters and columns into
    ``self.n_classes`` clusters; only the column labels are returned.
    """
    bc = SpectralBiclustering(n_clusters=(self.n_gene_classes, self.n_classes))
    bc.fit(data)
    # FIX: dropped the unused `gene_clusters = bc.row_labels_` local -- the
    # row labels were computed but never used or returned.
    return bc.column_labels_
def biclustering(matrix, distance, callback=None):
    """Search cluster counts in [2, limit) for the biclustering whose
    reordered matrix minimizes ``distance``.

    :param matrix: 2-D array to bicluster
    :param distance: scoring callable applied to the reordered matrix
    :param callback: optional progress callable receiving a float in [0.2, 1.0]
    :return: (row permutation, column permutation) of the best model
    """
    # Matrices with a tiny short side cannot be biclustered: identity order.
    if min(matrix.shape) <= 2:
        return np.arange(matrix.shape[0]), np.arange(matrix.shape[1])
    best_score = np.iinfo(np.dtype('uint16')).max
    best_model = None
    # upper bound on the number of clusters to try (needs revision)
    limit = max(int(min(matrix.shape) / 2) - 1, 3)
    for n_clusters in range(2, limit):
        if callback is not None:
            # first 20% of the progress bar is assumed used by the caller
            callback(0.2 + (n_clusters - 2) / (limit - 2) * 0.8)
        # perform biclustering for this cluster count
        model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                                     random_state=0)
        model.fit(matrix)
        reordered = matrix[np.argsort(model.row_labels_)]
        reordered = reordered[:, np.argsort(model.column_labels_)]
        # keep the model with the lowest score
        score = distance(reordered)
        if score < best_score:
            best_score, best_model = score, model
    return (np.argsort(best_model.row_labels_),
            np.argsort(best_model.column_labels_))
def fit_predict(self, D):
    """Run ConsensusClustering algorithm on data D.

    Return partition of input data and consensus matrix for best k.
    """
    # number of samples
    n = D.shape[0]
    # AUC score for each candidate k
    AUC_scores = np.zeros(len(self.num_clusters))
    for i, k in enumerate(self.num_clusters):
        M = self.calc_consensus(n, D, k)
        AUC_scores[i] = self.calc_auc(M)
    # find best number of clusters (k_best)
    idx_k_best = np.argmax(AUC_scores)
    # BUG FIX: the original read `K[idx_k_best]`, but `K` is undefined in
    # this scope -- the candidate list is `self.num_clusters`.
    k_best = self.num_clusters[idx_k_best]
    # uncomment to see the best k for given input data
    #print("Best number of clusters (k): ", k_best)
    M_k_best = self.calc_consensus(n, D, k_best)
    # partition D into k_best clusters based on M_k_best using
    # SpectralBiclustering
    model = SpectralBiclustering(n_clusters=k_best, method='bistochastic')
    model.fit(M_k_best)
    P = model.row_labels_
    return P, M_k_best
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset."""
    random_state = 0
    grid = ParameterGrid({'method': ['scale', 'bistochastic', 'log'],
                          'svd_method': ['randomized', 'arpack'],
                          'n_svd_vecs': [None, 20],
                          'mini_batch': [False, True],
                          'init': ['k-means++'],
                          'n_init': [3],
                          'n_jobs': [1]})
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=random_state)
    for mat in (S, csr_matrix(S)):
        for kwargs in grid:
            model = SpectralBiclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            if issparse(mat) and kwargs['method'] == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
            else:
                model.fit(mat)
                # 9 biclusters (3x3 checkerboard); every row/column lies in 3
                assert_equal(model.rows_.shape, (9, 30))
                assert_equal(model.columns_.shape, (9, 30))
                assert_array_equal(model.rows_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_equal(consensus_score(model.biclusters_,
                                             (rows, cols)), 1)
def fi_selection_algo(metadata, settings, X, target_atts_list=None):
    """Derive selection codes by biclustering the feature-importance matrix.

    Row cluster count comes from settings["selection"]["param"]; columns are
    always split into two groups.
    """
    fi_scores = get_fi_scores(X, target_atts_list, metadata)
    row_clusters = int(settings["selection"]["param"])
    model = SpectralBiclustering(n_clusters=(row_clusters, 2), method="log")
    model.fit(fi_scores)
    return labels_to_codes(model.row_labels_, target_atts_list)
def plotBicluster(df, n_clusters):
    """Bicluster ``df``, plot the reordered correlation heatmap, and return
    the reordered DataFrame."""
    model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                                 random_state=0)
    model.fit(df)
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    reordered = df.iloc[row_order, :].iloc[:, col_order]
    plotCorrHeatmap(dmat=reordered)
    return reordered
def plotBicluster(df, n_clusters, col_labels=None):
    """Bicluster ``df`` and plot the reordered heatmap with optional column
    labels; return the reordered DataFrame."""
    model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                                 random_state=0)
    model.fit(df)
    ordered = df.iloc[np.argsort(model.row_labels_), :]
    ordered = ordered.iloc[:, np.argsort(model.column_labels_)]
    plotCorrHeatmap(dmat=ordered, col_labels=col_labels)
    return ordered
def fit_data_to_model(self, shapey):
    """Fit a log-method biclustering with ``shapey`` clusters on self.data
    and cache the reordered matrix and labels on the instance."""
    model = SpectralBiclustering(n_clusters=shapey, method='log',
                                 random_state=0)
    model.fit(self.data)
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    self.fit_data = self.data[row_order][:, col_order]
    self.rowl = model.row_labels_
    self.coll = model.column_labels_
    self.shapex = shapey
def get_bicluster(self, data):
    """Bicluster ``data`` (NaNs treated as zero for fitting) and return the
    DataFrame reordered by row/column cluster labels."""
    # One column cluster per column of the input.
    model = SpectralBiclustering(n_clusters=data.shape[1], random_state=0)
    print(data.sum(axis=0))
    print(data.sum(axis=1))
    model.fit(data.fillna(0))
    by_rows = data.iloc[np.argsort(model.row_labels_)]
    return by_rows.iloc[:, np.argsort(model.column_labels_)]
def SpectralBiCluster(data, n_clusters=(4, 4)):
    """Bicluster ``data`` with a log-method checkerboard model and plot the
    matrix reordered by cluster labels.

    :param data: array-like matrix to bicluster
    :param n_clusters: (row, column) cluster counts for the model
    """
    from matplotlib import pyplot as plt
    from sklearn.cluster.bicluster import SpectralBiclustering
    # BUG FIX: `n_clusters` was accepted but never forwarded to the model,
    # so the parameter had no effect. (Also dropped the unused
    # make_checkerboard import.)
    model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                                 random_state=0)
    data = np.array(data)
    model.fit(data)
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
def fit_data_to_model(self, shapey):
    """Run spectral biclustering (log method) on self.data and store the
    reordered data, the row/column labels, and the cluster count."""
    model = SpectralBiclustering(n_clusters=shapey, method='log',
                                 random_state=0)
    model.fit(self.data)
    reordered = self.data[np.argsort(model.row_labels_)]
    reordered = reordered[:, np.argsort(model.column_labels_)]
    self.fit_data = reordered
    self.rowl = model.row_labels_
    self.coll = model.column_labels_
    self.shapex = shapey
def test_errors():
    """fit() must raise ValueError for invalid parameters and non-2D input."""
    data = np.arange(25).reshape((5, 5))
    invalid_params = (
        dict(n_clusters=(3, 3, 3)),     # too many cluster dimensions
        dict(n_clusters='abc'),         # non-numeric cluster count
        dict(n_clusters=(3, 'abc')),    # mixed tuple
        dict(method='unknown'),
        dict(svd_method='unknown'),
        dict(n_components=0),
        dict(n_best=0),
        dict(n_components=3, n_best=4),  # n_best may not exceed n_components
    )
    for kwargs in invalid_params:
        model = SpectralBiclustering(**kwargs)
        assert_raises(ValueError, model.fit, data)
    # 3-D input is rejected even with default parameters
    model = SpectralBiclustering()
    data = np.arange(27).reshape((3, 3, 3))
    assert_raises(ValueError, model.fit, data)
def plot_biclustering_with_pearson(time_ms, title):
    """Z-score the channel data for a ``time_ms`` slice of the global
    ``matrix``, bicluster it, and save the reordered heatmap as SVG."""
    sliced_matrix = slice_matrix(matrix, time_ms)
    channels_data = calculate_n_columns(sliced_matrix)
    z_score = stats.zscore(channels_data)
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    # BUG FIX: the format arguments were swapped -- %s received the integer
    # time and %i received the title string, raising TypeError at save time.
    plt.savefig('z_score_%s_biclustering_all_ts_%i.svg' % (title, time_ms))
def biclustering(filtered, checked) :
    # Bicluster filtered['data'] and draw two graphs via the module-level
    # `bd` helper: one fully reordered, one with a fixed x-axis order.
    # NOTE(review): relies on globals group1, group2 and module `bd` defined
    # elsewhere in the file -- confirm they are in scope at call time.
    ### over 2
    # Use a 2x2 checkerboard only when there are at least two data rows.
    if len(filtered['data']) >= 2 :
        n_clusters = (2, 2)
    else :
        n_clusters = (1, 1)
    model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0)
    data = np.asarray(filtered['data'])
    model.fit(data)
    #biclustering
    # y_fit_data: rows reordered only; fit_data: rows and columns reordered.
    y_fit_data = data[np.argsort(model.row_labels_)]
    fit_data = y_fit_data[:, np.argsort(model.column_labels_)]
    #set y label
    # Invert the row permutation: y_label[original_index] = new position.
    y = np.argsort(model.row_labels_)
    y_label = [0 for i in range(len(y))]
    for n in range(len(y)) :
        y_label[y[n]] = n
    #set x label
    # Same inversion for the column permutation.
    x = np.argsort(model.column_labels_)
    x_label = [0 for i in range(len(x))]
    for n in range(len(x)) :
        x_label[x[n]] = n
    # Fully reordered graph (rows and columns permuted by cluster).
    d1 = bd.draw_graph(group1, group2, checked, x = x, x_label = x_label, y_label = y_label, fit_data = fit_data, genus_data = filtered['genus'], pvalue_label = filtered['pvalue'], title = "After biclustering")
    d1.draw()
    # biclustering of fixed x-axis domain
    # Second graph keeps the original column order (identity x permutation)
    # and uses the rows-only reordered matrix.
    d2 = bd.draw_graph(group1, group2, checked, x_label = [i for i in range(len(group1+group2))], y_label = y_label, x = [i for i in range(len(group1+group2))], fit_data = y_fit_data, genus_data = filtered['genus'], pvalue_label = filtered['pvalue'], title = "After biclustering; fixed x domins")
    d2.draw()
def spectral_biclust(E, ngenes=3, nconditions=1, spectral_method="bistochastic", n=6, n_best_ratio=0.5, **kwargs):
    """Spectral-bicluster the standardized expression matrix ``E`` and wrap
    each (columns, rows) pair in a Bicluster of gene/condition labels."""
    # keep at least one of the n singular vectors
    n_best = max(int(n * n_best_ratio), 1)
    spectral = SpectralBiclustering(n_clusters=(nconditions, ngenes),
                                    method=spectral_method,
                                    n_components=n,
                                    n_best=n_best)
    spectral.fit(standardize(E))
    return [Bicluster(E.columns[col_mask], E.index[row_mask])
            for col_mask, row_mask in zip(spectral.columns_, spectral.rows_)]
def spectral_bi_cluster(data, n_clusters, para_jobs=1, random_state=None):
    """Bicluster ``data`` with the bistochastic method.

    :param n_clusters: two-element tuple/list of (row, column) cluster counts
    :return: (row_labels, column_labels) of the fitted model
    """
    from sklearn.cluster.bicluster import SpectralBiclustering
    assert len(n_clusters) == 2, \
        "n_cluster should be a tuple or list that contains 2 integer!"
    model = SpectralBiclustering(n_clusters,
                                 random_state=random_state,
                                 n_jobs=para_jobs,
                                 method='bistochastic',
                                 n_best=20,
                                 n_components=40)
    model.fit(data)
    return model.row_labels_, model.column_labels_
def _spectral_bicluster(self, n_clusters, interaction_matrix):
    """Bicluster an interaction matrix.

    :param n_clusters: cluster count passed to SpectralBiclustering
    :param interaction_matrix: matrix to cluster
    :return: (row labels, column labels) -- PDZ and peptide cluster ids
    """
    # BUG FIX: the `interaction_matrix` argument was accepted but ignored in
    # favour of self.interaction_matrix; cluster the matrix actually passed.
    clustering = SpectralBiclustering(n_clusters=n_clusters,
                                      random_state=0).fit(interaction_matrix)
    pdz_clusters = clustering.row_labels_
    peptide_clusters = clustering.column_labels_
    return pdz_clusters, peptide_clusters
def plot_biclustering_raw_data(time_ms, t=False):
    """Z-score a ``time_ms`` slice of the global ``matrix`` (optionally
    transposed), bicluster it, and show the reordered heatmap."""
    # take the transpose of sliced matrix
    if t:
        channels_data = slice_matrix(matrix, time_ms).T
    else:
        channels_data = slice_matrix(matrix, time_ms)
    # BUG FIX: converted the Python 2 `print a, b` statement to the print()
    # function, matching the Python 3 style of the surrounding code.
    print(len(channels_data), len(channels_data[1]))
    z_score = stats.zscore(channels_data)
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    # plt.savefig('z_score_raw_biclustering_all_ts_%i_T_%s.svg' % (time_ms, str(t)))
    plt.show()
def spectral(dataset_name, full, preprocessing, mindf, k1, k2, ngram_min, ngram_max, start, end, n):
    # Build (or reuse) a tf-idf matrix for the dataset and run spectral
    # biclustering with a (k1, k2) checkerboard, either on the full corpus
    # (full=True) or per time slice between `start` and `end` split into `n`
    # periods. Results are persisted via save_clasification* helpers.
    # NOTE(review): in the full branch the `start`/`end` parameters are
    # shadowed by time.time() timing variables -- harmless there since they
    # are unused, but confirm this is intentional.
    if not spectral_directory_exists(dataset_name):
        create_spectral_directory(dataset_name)
    h, c = obtain_file_name_from_dataset(dataset_name, preprocessing)
    corpus = obtain_full_corpus(h, c)
    if full:
        texts = corpus.text.values
        docnames = corpus.text.index.values
        # Build and cache the tf-idf matrix only once per dataset/preprocessing.
        if not tfidf_exists(dataset_name, preprocessing):
            X, v = create_tfidf(texts, mindf, ngram_min, ngram_max)
            words = v.get_feature_names()
            store_data(dataset_name, preprocessing, X, docnames, words)
        tfidf, documents, terms = load_data(dataset_name, preprocessing)
        # Skip clustering when a saved result for these parameters exists.
        if not spectral_exists(get_directory_dataset(dataset_name), dataset_name, preprocessing, mindf, k1, k2, ngram_min, ngram_max):
            start = time.time()
            model = SpectralBiclustering(n_clusters=(k1, k2), random_state=0)
            model.fit(tfidf)
            end = time.time()
            print("Biclustering process takes", int(round(end - start)), "seconds")
            save_clasification(get_directory_dataset(dataset_name), dataset_name, preprocessing, mindf, k1, k2, ngram_min, ngram_max, model)
    else:
        # Per-period mode: slice the corpus into n time windows.
        time_corpus = split_data_in_time_slices(corpus, start, end, n)
        # Build and cache one tf-idf matrix per period if not already present.
        if not tfidf_periods_exists(dataset_name, preprocessing, start, end, n):
            os.makedirs(
                get_directory_dataset_periods(dataset_name, preprocessing, start, end, n))
            for (s, e), corp in time_corpus.items():
                texts = corp.text.values
                docnames = corp.text.index.values
                X, v = create_tfidf(texts, mindf, ngram_min, ngram_max)
                words = v.get_feature_names()
                store_data_periods(dataset_name, preprocessing, start, end, n, s, e, X, docnames, words)
        # Cluster each period independently, skipping cached results.
        for s, e in time_corpus:
            tfidf, documents, terms = load_data_periods(
                dataset_name, preprocessing, start, end, n, s, e)
            if not spectral_periods_exists(dataset_name, preprocessing, mindf, k1, k2, ngram_min, ngram_max, start, end, n, s, e):
                st = time.time()
                model = SpectralBiclustering(n_clusters=(k1, k2), random_state=0)
                model.fit(tfidf)
                ed = time.time()
                print("Biclustering process takes", int(round(ed - st)), "seconds")
                save_clasification_periods(dataset_name, preprocessing, mindf, k1, k2, ngram_min, ngram_max, model, start, end, n, s, e)
def plot_biclusters_n_intervals(n_intervals=30000):
    """Average each of the 64 channels of the global ``matrix`` over windows
    of ``n_intervals`` samples, z-score, bicluster, and save the heatmap."""
    channels_data = [[] for _ in range(64)]
    for channel in range(64):
        row_data = matrix[channel]
        lo, hi = 0, n_intervals
        # Windows are dropped once `hi` reaches the end of the row.
        while hi < len(row_data):
            window = row_data[lo:hi]
            channels_data[channel].append(float(sum(window)) / len(window))
            lo, hi = hi, hi + n_intervals
    z_score = stats.zscore(np.array(channels_data))
    plt.title('Z Score Biclustering')
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_biclustering_all_%is.svg' % (n_intervals / 1000))
def spectral_biclustering(cls, *args):
    """
    Wrapper method for the spectral_biclustering algorithm

    :param args: the arguments to be sent to the sci-kit implementation
    :return: returns the Biclustering object
    """
    return cls(SpectralBiclustering(*args))
def spectral_cluster(dataframe, n_clusters=(30, 30), show_plots=False):
    """Bicluster ``dataframe`` (NaNs replaced by 0.0) with the log method.

    Optionally plots the rearranged matrix and its checkerboard structure.
    Returns the fitted SpectralBiclustering model.
    """
    model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                                 random_state=0)
    values = dataframe.fillna(0.0).values
    model.fit(values)
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    rearranged = values[row_order][:, col_order]
    if show_plots:
        plt.matshow(rearranged, cmap=plt.cm.Blues)
        plt.title("After biclustering; rearranged to show biclusters")
        plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                             np.sort(model.column_labels_) + 1),
                    cmap=plt.cm.Blues)
        plt.title("Checkerboard structure of rearranged data")
    return model
def test_perfect_checkerboard():
    """Noise-free checkerboards of several shapes must be recovered exactly."""
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)
    for shape in ((30, 30), (40, 30), (30, 40)):
        S, rows, cols = make_checkerboard(shape, 3, noise=0, random_state=0)
        model.fit(S)
        # perfect recovery -> consensus score of exactly 1
        assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)
def Spectral_BiClustering(M, args):
    '''Function to perform bipartite clustering'''
    # Create model
    # BUG FIX: this block used Python 2 print statements and a bare `except:`
    # that swallowed the error and then fell through to an unbound `model`
    # (guaranteed NameError). Now prints the hint and re-raises.
    try:
        if args.arpack:
            model = SpectralBiclustering(
                n_clusters=args.nClusters, svd_method='arpack')
        else:
            model = SpectralBiclustering(
                n_clusters=args.nClusters)
    except Exception:
        print('-r 1 may cause problems when svd_method has been set to arpack')
        raise
    print('Running biclustering')
    model.fit(M.tocsc())
    print('Biclustering done')
    # Fit to data
    # fit_data = M[np.argsort(model.row_labels_)]
    # fit_data = fit_data[:, np.argsort(model.column_labels_)]
    # Reorder the sparse matrix by cluster labels without densifying it:
    # remap every stored coordinate through the inverse permutation.
    fit_data = M.tocoo()
    fit_data.row = invert_permutation(np.argsort(model.row_labels_))[fit_data.row]
    fit_data.col = invert_permutation(np.argsort(model.column_labels_))[fit_data.col]
    save_clusters(model, fit_data, args, '_BiClustering', 1)
    return model, fit_data
def test_perfect_checkerboard():
    # Intentionally disabled: everything after the SkipTest is unreachable
    # and is kept only so the test can be re-enabled by deleting the raise.
    raise SkipTest("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and fixed.")
    model = SpectralBiclustering(3, svd_method="arpack", random_state=0)
    # Noise-free checkerboards of three shapes must be recovered exactly.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)
    S, rows, cols = make_checkerboard((40, 30), 3, noise=0, random_state=0)
    model.fit(S)
    assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)
    S, rows, cols = make_checkerboard((30, 40), 3, noise=0, random_state=0)
    model.fit(S)
    assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)
def _get_clusters_using_spectrals(corrarr, n_clusters=5, mode='co'):
    """Cluster the columns of ``corrarr`` with a spectral (co/bi)clustering.

    :param corrarr: DataFrame-like correlation matrix (needs ``.columns``)
    :param n_clusters: number of clusters for the model
    :param mode: 'co' for SpectralCoclustering, 'bi' for SpectralBiclustering
    :return: list of column-index lists, one per cluster
    :raises ValueError: if ``mode`` is neither 'co' nor 'bi'
    """
    if mode == 'co':
        model = SpectralCoclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        return clusters
    elif mode == 'bi':
        model = SpectralBiclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        # The checkerboard model repeats the column groups; keep only the
        # first period, up to where the first group reappears.
        repetition_start = clusters[1:].index(clusters[0]) + 1
        return clusters[:repetition_start]
    else:
        # BUG FIX: `raise("Mode wrong?")` raised a TypeError in Python 3
        # (strings are not exceptions); raise a proper ValueError instead.
        raise ValueError("Mode wrong?")
def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5, random_state=0)
    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    for mat in (S, csr_matrix(S)):
        for name, values in non_default_params.items():
            for value in values:
                model = SpectralBiclustering(n_clusters=3, n_init=3,
                                             init='k-means++',
                                             random_state=0)
                # override exactly one parameter from its default
                model.set_params(**{name: value})
                if issparse(mat) and model.get_params().get('method') == 'log':
                    # cannot take log of sparse matrix
                    with pytest.raises(ValueError):
                        model.fit(mat)
                    continue
                model.fit(mat)
                # 9 biclusters; each row/column lies in 3 of them
                assert model.rows_.shape == (9, 30)
                assert model.columns_.shape == (9, 30)
                assert_array_equal(model.rows_.sum(axis=0),
                                   np.repeat(3, 30))
                assert_array_equal(model.columns_.sum(axis=0),
                                   np.repeat(3, 30))
                assert consensus_score(model.biclusters_, (rows, cols)) == 1
                _test_shape_indices(model)
def cocluster(self, mx, blockdiag=False):
    """Co-cluster matrix ``mx`` and reorder it (and self.prev / self.case)
    by the resulting row/column labels.

    :param blockdiag: True -> SpectralCoclustering (block-diagonal),
                      False -> SpectralBiclustering (checkerboard, 4x3)
    :return: the reordered matrix
    """
    logging.info('Co-clustering Tade..')
    if blockdiag:
        logging.info('blockdiag')
        clusser = SpectralCoclustering(n_jobs=-1)
    else:
        # checkerboard
        logging.info('checkerboard')
        clusser = SpectralBiclustering(n_jobs=-1, n_clusters=(4, 3))
        # n_clusters=3, svd_method='randomized',
    clusser.fit(mx)
    row_order = np.argsort(clusser.row_labels_)
    col_order = np.argsort(clusser.column_labels_)
    logging.info('Argsorting mx rows..')
    mx = mx[row_order]
    self.prev = self.prev[row_order]
    logging.info('Argsorting mx cases..')
    mx = mx[:, col_order]
    self.case = self.case[col_order]
    return mx
def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5, random_state=0)
    overrides = {'method': ['scale', 'log'],
                 'svd_method': ['arpack'],
                 'n_svd_vecs': [20],
                 'mini_batch': [True]}
    for mat in (S, csr_matrix(S)):
        for param_name, param_values in overrides.items():
            for param_value in param_values:
                model = SpectralBiclustering(
                    n_clusters=3,
                    n_init=3,
                    init='k-means++',
                    random_state=0,
                )
                # flip a single parameter away from its default
                model.set_params(**{param_name: param_value})
                sparse_log = (issparse(mat) and
                              model.get_params().get('method') == 'log')
                if sparse_log:
                    # cannot take log of sparse matrix
                    assert_raises(ValueError, model.fit, mat)
                else:
                    model.fit(mat)
                    # 9 biclusters; each row/column belongs to exactly 3
                    assert_equal(model.rows_.shape, (9, 30))
                    assert_equal(model.columns_.shape, (9, 30))
                    assert_array_equal(model.rows_.sum(axis=0),
                                       np.repeat(3, 30))
                    assert_array_equal(model.columns_.sum(axis=0),
                                       np.repeat(3, 30))
                    assert_equal(consensus_score(model.biclusters_,
                                                 (rows, cols)), 1)
                    _test_shape_indices(model)
##0.780023781213 ############################################################################### ## Draw dendrogram Z = linkage(data, 'ward') plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('sample index') plt.ylabel('distance') dendrogram( Z, leaf_rotation=90., # rotates the x axis labels leaf_font_size=8., # font size for the x axis labels labels=np.array(authors) ) plt.show() ############################################################################### ## Biclustering data = data.astype('float') bc = SpectralBiclustering(n_clusters=(n_authors,5)) bc.fit(data) ## TODO : sort the rows and columns bc_data = data[np.argsort(bc.row_labels_)] bc_data = bc_data[:, np.argsort(bc.column_labels_)] ## How to annotate the words? plt.matshow(data, cmap = plt.cm.Blues) plt.title("Original dataset") plt.matshow(bc_data, cmap = plt.cm.Blues) plt.title("After biclustering; rearrange to show biclusters")
# CLI script fragment: load a sparse tf-idf matrix, bicluster it into
# (k x l) groups, and plot original vs rearranged heatmaps.
# NOTE(review): `mat_name`, `mat_filename`, `k`, `load_sparse_mat`, `float32`,
# `amax`/`amin` are defined earlier in the script (likely star-imports).
l = int(sys.argv[4])
output_mat_name = sys.argv[5]
tfidf = load_sparse_mat(mat_name,mat_filename).astype(float32)
# densify for plotting only; the sparse matrix is what gets clustered
data = tfidf.A
im = plt.matshow(data, aspect='auto', cmap='jet')
# share one color scale across all three heatmaps
vmax = amax(data)
vmin = amin(data)
plt.clim(vmin,vmax)
plt.colorbar(im)
m,n = tfidf.shape
print("Matrix dimensions: ",m,"x",n)
print("Row clusters:",k)
print("Column clusters:",l)
start = time.time()
model = SpectralBiclustering(n_clusters=(k,l),random_state=0)
model.fit(tfidf)
end = time.time()
print("Biclustering process takes",int(round(end-start)),"seconds")
# reorder rows then columns by cluster label for display
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
im = plt.matshow(fit_data, aspect='auto', cmap='jet')
plt.clim(vmin,vmax)
plt.colorbar(im)
# idealized checkerboard implied by the sorted labels
im = plt.matshow(np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1), cmap='jet',aspect='auto')
plt.clim(vmin,vmax)
plt.colorbar(im)
plt.title("Checkerboard structure of rearranged data")
plt.show()
# Tail of a plotting routine from earlier context: reorder the z-score
# matrix by bicluster labels and save the heatmap.
spectral_model.fit(z_score)
fit_data = z_score[np.argsort(spectral_model.row_labels_)]
fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.savefig('z_score_raw_biclustering_all_%is.svg' % (n_intervals / 1000))


def dump_raw_z_scores():
    """Persist z-scores of the raw global `matrix` via numpy's dump."""
    np.array(stats.zscore(np.array(matrix))).dump('raw_z_npdump.dump')


if __name__ == '__main__':
    # BUG FIX: this block contained unresolved git merge-conflict markers
    # (<<<<<<< HEAD / ======= / >>>>>>> 51502dc...), a syntax error.
    # Resolved in favour of the HEAD branch: load the dumped z-scores and
    # plot/save the biclustered heatmap. The discarded branch held only
    # commented-out calls plus dump_raw_z_scores() and a print of an
    # undefined name; regenerate the dump by calling dump_raw_z_scores()
    # manually if needed.
    z_scores = np.load('raw_z_npdump.dump')
    plt.title('Z Score Biclustering')
    spectral_model = SpectralBiclustering()
    spectral_model.fit(z_scores)
    fit_data = z_scores[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_bicluster.svg')
# train_features[user]['average_set_score'] = sum_set_scores / float(num_sets) # # s average score # sum_s_scores = 0 # for i in range(0, num_sets): # sum_s_scores += grades_rowdict[key]['s' + str(i)] # train_features[user]['average_s_score'] = sum_set_scores / float(num_sets) # # rest of the features # train_features[user]['course_score'] = grades_rowdict[key]['course'] # train_features[user]['final_exam_score'] = grades_rowdict[key]['final'] # train_features[user]['hw_score'] = grades_rowdict[key]['hw'] # train_features[user]['letter'] = grades_rowdict[key]['letter'] # train_features[user]['demerit'] = grades_rowdict[key]['demerit'] # else: # pass # MACHINE LEARNING CLUSTERING import numpy as np from sklearn.cluster import KMeans, DBSCAN kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10) kmeans.fit(train_features) from sklearn.cluster.bicluster import SpectralBiclustering model = SpectralBiclustering(n_clusters=5, method='log', random_state=0) model.fit(train_features) train_features.loc['bf7aa87b-444a-4eff-9f81-b4078e6dccd3'] model.row_labels_
# NOTE(review): this chunk begins mid-`try` -- the matching `try:` and the
# enclosing loop over media ids are outside this view; indentation below is
# a best-effort reconstruction.
except:
    print('FAYOL!')
# Mark the media owner and each interacting user in the boolean matrix m.
media_id_num = picsdict[media_id]
m[media_id_num, usersdict[db[media_id][3]]] = True
for user in temp:
    try:
        m[media_id_num, usersdict[user.username]] = True
    except:
        # username not in usersdict: remember it for later
        # print(':3 ', user.username)
        other_users.add(user.username)
# Cache the matrix on disk (protocol=2 keeps it Python-2 readable).
import pickle
pickle.dump( m, open( "save.p", "wb" ), protocol = 2 )
m = pickle.load( open( "save.p", "rb" ) )
import numpy as np
from matplotlib import pyplot as plt
plt.matshow(m, cmap=plt.cm.Blues)
# Bicluster the media-user matrix and plot it reordered by cluster.
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.metrics import consensus_score
model = SpectralBiclustering(method='bistochastic', n_jobs = -1)
model.fit(m)
fit_data = m[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
from sklearn.manifold import TSNE # C1 = sklearn.cluster.AgglomerativeClustering(n_clusters=5, affinity='precomputed') # # R1 = C1.fit_predict(Gram) # n = len(Gram) Di = np.reshape(np.diag(Gram),(n,1)) M = Di.dot(np.ones((1,n))) D = M + M.T - 2*Gram C2 = AffinityPropagation(affinity='precomputed') C1 = KMeans(n_clusters = 5) C3 = AgglomerativeClustering(n_clusters=5, affinity='precomputed',linkage='average') C4 = SpectralClustering(n_clusters=5,affinity='precomputed') C5 = SpectralBiclustering(n_clusters=(5,5)) R1 = C1.fit_predict(D) R2 = C2.fit_predict(D) R3 = C3.fit_predict(D) R4 = C4.fit_predict(Gram +11) R5 = C5.fit(D) print(R4) modèle = TSNE(n_components=2,metric='precomputed') Trans = modèle.fit_transform(D) G_ACP = ACP(Gram,precomputed=True) trace_ACP(G_ACP,[10]*5)
mplpyplot.show() # nodebox section end n_clusters = (4, 3) data, rows, columns = make_checkerboard( shape=(300, 300), n_clusters=n_clusters, noise=10, shuffle=False, random_state=0) plt.matshow(data, cmap=plt.cm.Blues) plt.title("Original dataset") data, row_idx, col_idx = sg._shuffle(data, random_state=0) plt.matshow(data, cmap=plt.cm.Blues) plt.title("Shuffled dataset") model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0) model.fit(data) score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx])) print("consensus score: {:.1f}".format(score)) fit_data = data[np.argsort(model.row_labels_)] fit_data = fit_data[:, np.argsort(model.column_labels_)] plt.matshow(fit_data, cmap=plt.cm.Blues) plt.title("After biclustering; rearranged to show biclusters") plt.matshow(np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1), cmap=plt.cm.Blues)