def Spectral_CoClustering(args):
    """Perform bipartite (spectral co-)clustering on ``args.M``.

    Args:
        args: Namespace-like object. The code reads ``args.arpack`` (truthy
            selects the ARPACK SVD solver), ``args.nClusters`` and
            ``args.M`` (must provide ``tocsc()`` and ``tocoo()``).

    Returns:
        tuple: ``(model, fit_data)`` -- the fitted ``SpectralCoclustering``
        estimator and ``args.M`` in COO form with its row/column indices
        remapped through the cluster-sorted permutations.

    Raises:
        Exception: whatever the estimator constructor raises; the hint about
        ``-r 1`` is printed first and the error is then re-raised instead of
        falling through to an unrelated ``NameError`` on ``model``.
    """
    # Create model
    model_kwargs = {'n_clusters': args.nClusters}
    if args.arpack:
        model_kwargs['svd_method'] = 'arpack'
    try:
        model = SpectralCoclustering(**model_kwargs)
    except Exception:
        print('-r 1 may cause problems when svd_method has been set to arpack')
        raise

    print('Running coclustering')
    model.fit(args.M.tocsc())
    print('Coclustering done')

    # Fit to data: relabel the COO coordinates instead of fancy-indexing the
    # matrix itself.
    # NOTE(review): invert_permutation is defined elsewhere; presumably it
    # maps original index -> position in the sorted order -- confirm.
    fit_data = args.M.tocoo()
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    fit_data.row = invert_permutation(row_order)[fit_data.row]
    fit_data.col = invert_permutation(col_order)[fit_data.col]

    save_clusters(model, fit_data, args, '_CoClustering')
    return model, fit_data
def test_spectral_coclustering():
    """Spectral co-clustering must recover three planted biclusters exactly.

    Every combination in the parameter grid is fitted on both the dense
    matrix and its CSR form.
    """
    seed = 0
    grid_spec = {
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [10],
        'n_jobs': [1],
    }

    dense, true_rows, true_cols = make_biclusters(
        (30, 30), 3, noise=0.5, random_state=seed)
    # The data has to be non-negative before it is made sparse.
    dense -= dense.min()
    # Zero out the small entries.
    dense = np.where(dense < 1, 0, dense)

    for matrix in (dense, csr_matrix(dense)):
        for params in ParameterGrid(grid_spec):
            estimator = SpectralCoclustering(
                n_clusters=3, random_state=seed, **params)
            estimator.fit(matrix)

            assert_equal(estimator.rows_.shape, (3, 30))
            # Each row and each column belongs to exactly one bicluster.
            assert_array_equal(estimator.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(estimator.columns_.sum(axis=0), np.ones(30))
            assert_equal(
                consensus_score(estimator.biclusters_,
                                (true_rows, true_cols)),
                1)

            _test_shape_indices(estimator)
def bi_clustering(data, args):
    """Co-cluster ``data``, save the row labels and optionally plot.

    Zero entries are replaced by the matrix maximum and the matrix is then
    scaled by that maximum before fitting. The caller's array is no longer
    modified: the substitution is done on a copy.

    Args:
        data: 2-D array supporting boolean-mask assignment and
            ``data[rows]`` / ``data[:, cols]`` indexing.
        args: Namespace-like object providing ``k`` (number of clusters),
            ``o`` (output path for the row labels) and ``plot`` (truthy to
            show the figures).

    Returns:
        None.
    """
    print('clustering...')
    data = data.copy()  # do not overwrite the caller's matrix
    max_val = np.max(data)
    data[data == 0] = max_val
    data = data / max_val

    model = SpectralCoclustering(n_clusters=args.k, svd_method='arpack')
    model.fit(data)
    np.savetxt(args.o, model.row_labels_, fmt="%d", newline="\n")

    if not args.plot:
        return

    # Rearranged matrix is only needed for the plots.
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    # NOTE(review): shuffle() is defined elsewhere -- confirm it returns a
    # shuffled copy rather than shuffling in place.
    plt.matshow(shuffle(data), cmap=plt.cm.Blues)
    plt.title("Org dataset")
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()
def run(self, data):
    """Fit a spectral co-clustering on ``data`` and return the column labels.

    Args:
        data: Matrix handed unchanged to ``SpectralCoclustering.fit``.

    Returns:
        The fitted estimator's ``column_labels_`` (the cell clusters); the
        row labels are not used.
    """
    # NOTE(review): SpectralCoclustering documents n_clusters as a single
    # int; a (rows, columns) tuple is what SpectralBiclustering accepts --
    # confirm which estimator was intended.
    bc = SpectralCoclustering(n_clusters=(self.n_gene_classes, self.n_classes))
    bc.fit(data)
    return bc.column_labels_
def _ip_to_int(dotted):
    """Pack a dotted-quad string into one integer via hex-concatenated octets."""
    octets = dotted.split('.')
    return int("%02x%02x%02x%02x" % (int(octets[0]), int(octets[1]),
                                     int(octets[2]), int(octets[3])), 16)


def main():
    """Co-cluster the records of ``10k.txt`` and print each cluster's rows.

    Each comma-separated line becomes one feature row: hour of the timestamp
    in field 1, the two IP addresses in fields 2 and 3 as integers, then
    fields 4-5 and 7-9 as they are (converted to float32 by numpy).
    """
    n_clusters = 5

    # Close the file deterministically instead of leaking the handle.
    with open('10k.txt', 'r') as origin:
        lines = origin.readlines()

    x = []
    for line in lines:
        fields = line.split(',')
        hour = datetime.fromtimestamp(int(fields[1][0:11])).hour
        row = ([hour, _ip_to_int(fields[2]), _ip_to_int(fields[3])]
               + fields[4:6] + fields[7:10])
        x.append(row)
    data = np.array(x, dtype='float32')

    model = SpectralCoclustering(n_clusters=n_clusters)
    model.fit(data)
    print(model.rows_)

    for i in range(n_clusters):
        print("Cluster" + str(i) + ':')
        # Iterate over the rows actually present rather than a hard-coded
        # 10000, which raised IndexError on shorter inputs.
        for j in np.flatnonzero(model.rows_[i]):
            print(j, end=' ')
        print(' ')
def update_bicluster(batch_df, task_df, compound_df, mode='RobustMT', K=5):
    """Assign batch labels and colours to ``batch_df`` and ``compound_df``.

    For the multi-task modes the first ``n_tasks`` columns of ``batch_df``
    are standardised and co-clustered into ``K`` clusters; in ``'ST'`` mode
    the first column is ranked and cut into ``K`` quantile bins instead.

    Args:
        batch_df: DataFrame of the minibatch; gains ``batch_label`` and
            ``batch_label_color`` columns and must contain ``Label_id``.
        task_df: DataFrame whose column count determines ``n_tasks``.
        compound_df: DataFrame with a ``label`` column; gains
            ``batch_label_color``, ``batch_label`` and ``label_color``.
        mode: One of ``'RobustMT'``, ``'ST'`` or ``'MT'``.
        K: Number of co-clusters / quantile bins.

    Returns:
        tuple: ``(batch_df, task_df, compound_df)`` (the inputs, modified in
        place).

    Raises:
        ValueError: if ``mode`` is not one of the three supported values
        (previously this surfaced later as an unbound ``n_tasks``).
    """
    if mode == 'RobustMT':
        n_tasks = task_df.shape[1] - 1
    elif mode == 'ST':
        n_tasks = 1
    elif mode == 'MT':
        n_tasks = task_df.shape[1]
    else:
        raise ValueError(
            "mode must be 'RobustMT', 'ST' or 'MT', got %r" % (mode,))

    if mode != 'ST':
        # cocluster of the minibatch predictive matrix
        # A plain ndarray is used because np.matrix input is deprecated.
        X = preprocessing.scale(np.asarray(batch_df)[:, 0:n_tasks])
        cocluster = SpectralCoclustering(n_clusters=K, random_state=0)
        cocluster.fit(X)
        batch_df['batch_label'] = cocluster.row_labels_
    else:
        rank_x = batch_df[batch_df.columns[0]].rank().tolist()
        groups = pd.qcut(rank_x, K, duplicates='drop')
        batch_df['batch_label'] = groups.codes

    # generate color hex for batch_label
    lut = dict(zip(batch_df['batch_label'].unique(), Category20_20))
    batch_df['batch_label_color'] = batch_df['batch_label'].map(lut)

    # generate color hex for compound_df
    lut2 = dict(zip(batch_df['Label_id'], batch_df['batch_label_color']))
    compound_df['batch_label_color'] = compound_df['label'].map(lut2)
    lut22 = dict(zip(batch_df['Label_id'], batch_df['batch_label']))
    compound_df['batch_label'] = compound_df['label'].map(lut22)

    groups = pd.qcut(compound_df['label'].tolist(), len(Category20b_20),
                     duplicates='drop')
    compound_df['label_color'] = [Category20b_20[xx] for xx in groups.codes]
    return batch_df, task_df, compound_df
def plot_biclusters():
    """Plot the bicluster map for the best-scoring grid configuration.

    Reads the feature table, standardises it, looks up the configuration
    with the lowest ``tvr`` score in ``bic_scores.csv`` and hands it to
    ``_plot_bicmaps``, which trains the model and draws the figure.
    """
    co_grid = ParameterGrid(
        {'n_clusters': np.arange(2, 10, 1), 'n_init': [20]}
    )

    X_orig = pd.read_csv(
        './../../data_source/to_analysis/original_images/'
        'all_features_original_images.csv',
        index_col=0)
    X_orig_std = StandardScaler().fit_transform(X_orig.values)

    #_run_experiment(co_grid, X_orig_std)

    df_avg_co_scores = pd.read_csv('bic_scores.csv', index_col=0)
    tvr_scores = df_avg_co_scores.loc[:, 'tvr'].values
    # _run_experiment writes one row per configuration, numbered from 0 in
    # grid order, so the argmin position indexes the grid directly. The old
    # "- 1" picked the neighbouring config (and the last one when the best
    # score was in row 0).
    best_co_config = co_grid[int(np.argmin(tvr_scores))]
    print(best_co_config, min(tvr_scores))

    #plt.figure()
    #_plot_tve(df_avg_co_scores, co_grid)

    # _plot_bicmaps fits its own model for this configuration, so no model
    # is trained here.
    plt.figure()
    _plot_bicmaps(X_orig_std, best_co_config)
def correlation_matrix(df):
    """Save a co-clustered heatmap of the Pearson correlations of ``df``.

    The ``asset`` and ``unixtime`` columns are dropped, the remaining
    columns are correlated, co-clustered into four groups and drawn next to
    a narrow strip showing the row cluster labels. The figure is written to
    ``reports/correlation_matrix.png``.
    """
    sns.set(style='white', font_scale=.9)
    clusters = 4

    features = df.drop(['asset', 'unixtime'], axis=1)
    pearson = features.corr(method='pearson')

    estimator = SpectralCoclustering(n_clusters=clusters, random_state=0)
    estimator.fit(pearson)

    # Rows in descending cluster order, columns in ascending cluster order.
    row_order = np.argsort(estimator.row_labels_)[::-1]
    col_order = np.argsort(estimator.column_labels_)
    pearson = pearson.iloc[row_order, col_order]

    fig, (label_ax, heat_ax) = plt.subplots(
        1, 2,
        figsize=(10, 8),
        gridspec_kw=dict(width_ratios=[1.5, pearson.shape[1]]),
    )

    # Left strip: one coloured cell per row showing its cluster label.
    label_column = np.sort(estimator.row_labels_)[::-1].reshape(-1, 1)
    sns.heatmap(data=label_column, ax=label_ax, cbar=False,
                linewidths=.005, cmap=sns.color_palette('Spectral'))
    label_ax.set(xticks=(), yticks=())

    # Right panel: the reordered correlation matrix itself.
    sns.heatmap(data=pearson, cmap=sns.diverging_palette(220, 10, n=11),
                linewidths=.005, cbar_kws={'shrink': .75},
                vmax=1, vmin=-1, ax=heat_ax)
    heat_ax.set_xticklabels(pearson.columns, rotation='vertical')
    heat_ax.set_yticklabels(pearson.index, rotation='horizontal')

    fig.suptitle(f'Variable Correlation Matrix in {clusters} Clusters',
                 fontsize=20)
    fig.tight_layout(w_pad=.5, rect=(.03, 0, 1, .95))
    fig.savefig('reports/correlation_matrix.png')
def bicluster(*cotables):
    """Reorder every table by the co-clustering of the first one.

    Args:
        *cotables: One or more DataFrames. The first is co-clustered with
            one cluster per column; its row/column ordering is then applied
            positionally to all of them.

    Returns:
        list: The tables, each reindexed with ``iloc`` in cluster order.
    """
    table = cotables[0]
    model = SpectralCoclustering(n_clusters=table.shape[1], random_state=0)
    # DataFrame.as_matrix() was removed from pandas; .values is the
    # equivalent available on every version.
    model.fit(table.values)

    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)
    return [cotable.iloc[row_order, col_order] for cotable in cotables]
def spectral_co_cluster(data, n_clusters, para_jobs=1, random_state=None):
    """Co-cluster ``data`` and return its row and column labels.

    Args:
        data: Matrix passed to ``SpectralCoclustering.fit``.
        n_clusters: Number of co-clusters.
        para_jobs: Forwarded as ``n_jobs`` when the installed scikit-learn
            still accepts that argument; ignored otherwise.
        random_state: Seed forwarded to the estimator.

    Returns:
        tuple: ``(row_labels, col_labels)``.
    """
    # The estimator moved out of sklearn.cluster.bicluster; try the current
    # location first and fall back to the legacy module path.
    try:
        from sklearn.cluster import SpectralCoclustering
    except ImportError:
        from sklearn.cluster.bicluster import SpectralCoclustering

    try:
        model = SpectralCoclustering(n_clusters=n_clusters,
                                     random_state=random_state,
                                     n_jobs=para_jobs)
    except TypeError:
        # Newer scikit-learn releases no longer take n_jobs here.
        model = SpectralCoclustering(n_clusters=n_clusters,
                                     random_state=random_state)

    model.fit(data)
    return model.row_labels_, model.column_labels_
def spectral_coclustering(cls, *args, **kwargs):
    """
    Wrapper method for the spectral_coclustering algorithm

    Keyword arguments are forwarded as well, so options that scikit-learn
    only accepts by keyword (e.g. ``svd_method``, ``random_state``) can be
    set through this wrapper.

    :param args: the positional arguments to be sent to the sci-kit
        implementation
    :param kwargs: the keyword arguments to be sent to the sci-kit
        implementation
    :return: returns the Biclustering object
    """
    model = SpectralCoclustering(*args, **kwargs)
    return cls(model)
def get_clusters(data, n_clusters=5):
    """Co-cluster ``data`` and list the row/column indices of each cluster.

    Args:
        data: Matrix passed to ``SpectralCoclustering.fit``.
        n_clusters: Number of co-clusters (defaults to the previously
            hard-coded 5).

    Returns:
        tuple: ``(word_clusters, hidden_clusters)`` -- two lists of length
        ``n_clusters`` holding, per cluster, the row indices and the column
        indices as plain Python lists.
    """
    coclusters = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    coclusters.fit(data)

    word_clusters = []
    hidden_clusters = []
    for i in range(n_clusters):
        # One lookup per cluster yields both the row and column indices.
        row_idx, col_idx = coclusters.get_indices(i)
        word_clusters.append(row_idx.tolist())
        hidden_clusters.append(col_idx.tolist())
    return word_clusters, hidden_clusters
def _plot_bicmaps(X_orig_std, best_co_config):
    """Draw the co-clustered heatmap with bicluster outlines and save it.

    Args:
        X_orig_std: 2-D feature matrix to co-cluster and plot.
        best_co_config: Parameter dict applied to the estimator; its
            ``'n_clusters'`` entry is also passed to ``bic_coords``.
    """
    # Train model with best config.
    model = SpectralCoclustering(random_state=0, svd_method='arpack')
    model.set_params(**best_co_config)
    model.fit(X_orig_std)

    # Group rows, then columns, by their cluster label.
    rows_sorted = X_orig_std[np.argsort(model.row_labels_), :]
    fit_data = rows_sorted[:, np.argsort(model.column_labels_)]

    heatmap_kwargs = dict(
        robust=True,
        cmap=plt.cm.viridis,
        fmt='f',
        vmin=np.min(fit_data),
        vmax=np.max(fit_data),
        cbar=False,
    )
    hmap = sns.heatmap(fit_data, **heatmap_kwargs)

    # Outline every bicluster with a closed dark-red rectangle.
    coords = bic_coords(model, best_co_config['n_clusters'])
    x_corners = ['x1', 'x2', 'x2', 'x1', 'x1']
    y_corners = ['y1', 'y1', 'y2', 'y2', 'y1']
    for num in coords.index:
        plt.plot(
            (coords.loc[num, x_corners]),
            (coords.loc[num, y_corners]),
            c='darkred'
        )

    plt.ylabel('Patients')
    plt.xlabel('Features')
    plt.yticks([], [])
    plt.xticks([], [])

    # Separate axis on the right-hand side for the colour bar.
    divider = make_axes_locatable(hmap)
    cax = divider.append_axes('right', size='3%', pad='2%')
    colorbar.colorbar(
        hmap.get_children()[0],
        cax=cax,
        orientation='vertical'
    )
    #cax.xaxis.set_label_text('AUC', fontname='Sans')
    #cax.xaxis.set_label_position('top')

    tick_values = np.linspace(
        np.nanmin(fit_data), np.nanmax(fit_data), 6
    )
    cax.yaxis.set_ticks(tick_values)
    cax.yaxis.set_ticklabels([f'{num:.01f}' for num in tick_values])

    plt.savefig(
        '../biclustering/bic_map_original_images.pdf',
        bbox_inches='tight',
        transparent=True,
        dpi=CONFIG.DPI,
    )
def plot_coclusters_raw_data(time_ms, t=False):
    """Co-cluster the z-scored slice of ``matrix`` and save the plot as SVG.

    Args:
        time_ms: Window passed to ``slice_matrix`` and used in the title and
            file name.
        t: When true, the sliced matrix is transposed before clustering.
            Previously both branches were identical, so the flag only
            changed the output file name.
    """
    # NOTE(review): `matrix` and `slice_matrix` come from module scope.
    channels_data = slice_matrix(matrix, time_ms)
    if t:
        # take the transpose of sliced matrix
        channels_data = np.transpose(channels_data)
    print(len(channels_data), len(channels_data[1]))

    z_score = stats.zscore(channels_data)

    spectral_model = SpectralCoclustering()
    spectral_model.fit(z_score)

    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    # matshow opens a new figure, so the title must be set afterwards to
    # end up on the figure that is saved.
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    plt.savefig('z_score_raw_coclustering_all_ts_%i_T_%s.svg' % (time_ms, str(t)))
def _get_clusters_using_spectrals(corrarr, n_clusters=5, mode='co'): if mode=='co': model = SpectralCoclustering(n_clusters, random_state=0) model.fit(corrarr) indices = np.arange(corrarr.columns.size) clusters = [indices[x].tolist() for x in model.columns_] return clusters elif mode=='bi': model = SpectralBiclustering(n_clusters, random_state=0) model.fit(corrarr) indices = np.arange(corrarr.columns.size) clusters = [indices[x].tolist() for x in model.columns_] repetition_start = clusters[1:].index(clusters[0]) + 1 return clusters[:repetition_start] else: raise("Mode wrong?")
def cocluster(self, mx, blockdiag=False):
    """Reorder ``mx``, ``self.prev`` and ``self.case`` by cluster labels.

    Args:
        mx: Matrix to cluster; rows are kept aligned with ``self.prev`` and
            columns with ``self.case``.
        blockdiag: True selects ``SpectralCoclustering`` (block-diagonal
            structure); False selects ``SpectralBiclustering`` with a
            (4, 3) checkerboard.

    Returns:
        The matrix with rows and columns sorted by cluster label.
    """
    logging.info('Co-clustering Tade..')
    if not blockdiag:
        # checkerboard
        logging.info('checkerboard')
        clusser = SpectralBiclustering(n_jobs=-1, n_clusters=(4, 3))
        #n_clusters=3, svd_method='randomized',
    else:
        logging.info('blockdiag')
        clusser = SpectralCoclustering(n_jobs=-1)
    clusser.fit(mx)

    logging.info('Argsorting mx rows..')
    row_order = np.argsort(clusser.row_labels_)
    mx = mx[row_order]
    self.prev = self.prev[row_order]

    logging.info('Argsorting mx cases..')
    col_order = np.argsort(clusser.column_labels_)
    mx = mx[:, col_order]
    self.case = self.case[col_order]
    return mx
def main(model):
    """Render the thresholded transition matrix, raw and co-clustered.

    Loads ``Psi_sz`` and ``count_z`` from the HDF store at ``model``, builds
    a column-normalised z-to-z matrix, binarises it at ``1 / n`` and writes
    ``tmat.pdf`` plus the co-cluster-sorted version ``tmat-cluster.pdf``.

    Args:
        model: Path of the HDF5 model file.
    """
    # The context manager closes the store even if a read or the assert
    # fails; everything needed is pulled into memory first.
    with pd.HDFStore(model) as store:
        from_ = store['from_'][0][0]
        assert from_ == 0

        Psi_oz = store['Psi_sz'].values
        count_z = store['count_z'].values[:, 0]

    Psi_oz = Psi_oz / Psi_oz.sum(axis=0)
    Psi_zo = (Psi_oz * count_z).T
    Psi_zo = Psi_zo / Psi_zo.sum(axis=0)

    ZtZ = Psi_zo.dot(Psi_oz)
    ZtZ = ZtZ / ZtZ.sum(axis=0)

    # Binarise: 1 where the entry reaches the uniform level 1/n, else 0.
    L = (ZtZ >= 1.0 / (len(ZtZ))).astype(ZtZ.dtype)

    colormap = toyplot.color.brewer.map("Purples", domain_min=0,
                                        domain_max=1, reverse=True)
    print(colormap)
    canvas = toyplot.matrix((L.T, colormap), label="P[z' | z]",
                            colorshow=False, tlabel="To z'",
                            llabel="From")[0]
    #canvas.axes(ylabel='From z', xlabel='To z\'')
    toyplot.pdf.render(canvas, 'tmat.pdf')

    # Named separately so the `model` path argument is not shadowed.
    cocluster_model = SpectralCoclustering(n_clusters=3)
    cocluster_model.fit(L)
    fit_data = L[np.argsort(cocluster_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(cocluster_model.column_labels_)]
    canvas = toyplot.matrix((fit_data, colormap), label="P[z' | z']",
                            colorshow=False)[0]
    toyplot.pdf.render(canvas, 'tmat-cluster.pdf')
def biclustering(db):
    """Co-cluster the device-by-programme view-time matrix and report.

    Reads the CSV, treats the first column as the device id, normalises
    each row with ``fraction`` and fits ten co-clusters. For the first five
    clusters it prints the shape, the programme names, the sub-matrix and
    the device count, then calls ``get_income`` for the devices.

    Args:
        db: Handle passed through to ``get_income``.
    """
    df = pd.read_csv(
        '/home/fan/intern/process_db/analysis/viewtime_matrix_501_0.1.csv')
    dma = 501
    print(df.shape)

    # First column holds the device ids (.ix is gone from pandas; this
    # access is positional).
    dev_list = df.iloc[:, 0].values
    df.drop(df.columns[0], axis=1, inplace=True)
    # Take the programme names only after the id column is dropped, so the
    # column indices returned by the model line up with them.
    prog_list = df.columns.values

    # NOTE(review): fraction is defined elsewhere; presumably it returns a
    # row of the same length -- confirm.
    df = df.apply(fraction, axis=1)

    model = SpectralCoclustering(n_clusters=10)
    model.fit(df)
    print(model.get_params())

    # NOTE(review): only the first 5 of the 10 clusters are reported.
    for i in range(0, 5):
        print('Size of one cluster:', model.get_shape(i))
        indices = model.get_indices(i)
        print(prog_list[indices[1]])
        print(model.get_submatrix(i, df.values))
        dev_in_cluster = dev_list[indices[0]]
        print('number of devices within this cluster:', len(dev_in_cluster))
        get_income(db, dma, dev_in_cluster.tolist())
def cluster_ex_by_feature_matrix(sub_ex_by_feat_mat, plot_file):
    """Co-cluster an example-by-feature matrix and save a heatmap.

    Prints a warning (without aborting) when the input has more than 50000
    rows, fits 50 co-clusters, reorders the data by cluster label and saves
    a plot of rows 0..500 to ``plot_file``.

    NOTE(review): ``plot_df`` is reassigned several times; the value that
    reaches the model is ``sub_ex_by_feat_df2``. That name and
    ``sub_ex_by_feat_mat_1`` are not defined in this function and must
    exist at module scope -- confirm. The parameter ``sub_ex_by_feat_mat``
    only influences the size warning and the first (overwritten) subset.

    Returns:
        The literal string ``"pretty picture"``.
    """
    if sub_ex_by_feat_mat.shape[0] > 50000:
        print "Matrix too large to be efficient, pleased reduce number of examples"
    # Subset down to motifs that are used
    plot_df = sub_ex_by_feat_mat[:, np.apply_along_axis(
        np.max, 0, sub_ex_by_feat_mat.toarray()
    ) != 0]
    # for numpy array: keep rows with more than 10 and columns with more
    # than 50 non-zero entries
    plot_df = sub_ex_by_feat_mat_1[np.apply_along_axis(
        lambda row: (row != 0).sum(), 1,
        sub_ex_by_feat_mat_1.toarray()) > 10, :]
    plot_df = plot_df[:, np.apply_along_axis(lambda column: (column != 0).sum(), 0,
                                             sub_ex_by_feat_mat_1.toarray()) > 50]
    # for pandas: same thresholds applied to the DataFrame variant
    plot_df = sub_ex_by_feat_df2.ix[
        sub_ex_by_feat_df2.apply(lambda row: (row != 0).sum(), 1) > 10, :]
    plot_df = plot_df.ix[:,
                         plot_df.apply(lambda row: (row != 0).sum(), 0) > 50]
    # Overrides every subset computed above with the unfiltered DataFrame.
    plot_df = sub_ex_by_feat_df2
    # Result of this expression is discarded.
    np.apply_along_axis(lambda column: (column != 0).sum(), 0,
                        sub_ex_by_feat_mat_1.toarray())
    model = SpectralCoclustering(n_clusters=50)
    model.fit(plot_df)  # fits for 50K
    # Reorder rows, then columns, so members of a cluster are adjacent.
    fit_data = plot_df.ix[np.argsort(model.row_labels_)]
    fit_data = fit_data.ix[:, np.argsort(model.column_labels_)]
    plt.figure()
    plt.matshow(fit_data.ix[0:500, ], cmap=plt.cm.YlGnBu, aspect='auto')
    plt.savefig(plot_file)
    print "DONE: biclustering plot here: {0}".format(plot_file)
    return "pretty picture"
def _run_experiment(co_grid, X_orig_std):
    """Score every grid configuration over 40 seeds and save the averages.

    For each configuration a model is built per random state and passed to
    ``biclusters``; the per-seed metrics are nan-averaged, then averaged
    again into a single ``tvr`` value. The result is written to
    ``bic_scores.csv`` with one row per configuration (index ``ConfigID``).

    Args:
        co_grid: Iterable of parameter dicts.
        X_orig_std: Standardised data matrix handed to ``biclusters``.
    """
    np.random.seed(seed=0)
    seeds = np.random.choice(40, size=40)

    per_config = {}
    for config_id, config in enumerate(co_grid):
        seed_scores = []
        for seed in seeds:
            estimator = SpectralCoclustering(random_state=seed,
                                             svd_method='arpack')
            # NOTE: Outputs a TVE score.
            result = biclusters(estimator, X_orig_std, config)
            seed_scores.append(result.external_metrics.values)
        per_config[config_id] = np.nanmean(seed_scores, axis=0)

    tvr_column = [np.mean(scores, axis=0) for scores in per_config.values()]

    summary = pd.DataFrame(tvr_column, columns=['tvr'])
    summary.index.name = 'ConfigID'
    summary.to_csv('bic_scores.csv')
def Block_diagonal(input_path, top_sd, n_clusters, output_path):
    """Co-cluster the most variable rows of a table and export the results.

    Args:
        input_path: Tab-separated input file; first column is the index,
            lines starting with ``#`` are skipped.
        top_sd: Fraction (0-1) of rows to keep, ranked by per-row standard
            deviation in descending order.
        n_clusters: Number of co-clusters.
        output_path: Directory receiving ``fit_data.csv``,
            ``bicluster.png`` and ``output.xlsx``.

    Side effects:
        Writes the three files above and prints the cluster sizes.
    """
    ### input data
    input_dat = pd.read_csv(input_path, index_col=0, sep='\t', comment='#')
    pro_dat = input_dat.fillna(0)

    # Keep the top fraction of rows by standard deviation. `.ix` no longer
    # exists in pandas; the row count is positional (iloc) and the row
    # lookup is by label (loc).
    df_sd = pro_dat.apply(np.std, axis=1)
    df_sd_sort = df_sd.sort_values(ascending=False)
    df_sd_sort_top = df_sd_sort.iloc[:int(len(df_sd_sort) * top_sd)]
    pro_dat = pro_dat.loc[df_sd_sort_top.index, :]
    sd_index = pro_dat.index
    sd_sample_names = pro_dat.columns

    ### model
    model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    model.fit(pro_dat)
    row_order = np.argsort(model.row_labels_)
    col_order = np.argsort(model.column_labels_)

    ### output the model fitting data
    values = np.array(pro_dat)
    fit_data = pd.DataFrame(values[row_order][:, col_order])
    fit_data.index = sd_index[row_order]
    fit_data.columns = sd_sample_names[col_order]
    out_fit_data_path = os.path.join(output_path, 'fit_data.csv')
    fit_data.to_csv(out_fit_data_path)

    ### output image
    fig = plt.figure(figsize=(20, 40))
    ax = fig.add_subplot(111)
    ax.matshow(fit_data, cmap=plt.cm.Blues)
    ax.set_title("After biclustering; rearranged to show biclusters")
    out_img_path = os.path.join(output_path, 'bicluster.png')
    fig.savefig(out_img_path)

    ### output module
    a11 = pd.Series(model.row_labels_)
    b11 = pd.Series(model.column_labels_)
    c11 = a11.groupby(a11).size()
    c22 = b11.groupby(b11).size()
    # Sorted labels line up with fit_data's axes, which are in argsort order.
    d11 = pd.DataFrame(a11.sort_values().values, fit_data.index.values)
    d22 = pd.DataFrame(b11.sort_values().values, fit_data.columns.values)
    d11.columns = ['cpg_module']
    d22.columns = ['sample_module']

    out_module_path = os.path.join(output_path, 'output.xlsx')
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # is not available in current pandas.
    with pd.ExcelWriter(out_module_path) as writer:
        d11.to_excel(writer, sheet_name='Sheet1')
        d22.to_excel(writer, sheet_name='Sheet2')

    print("cpg module:")
    print(c11)
    print("\n")
    print("sample module:")
    print(c22)
# exclude 'comp.os.ms-windows.misc' categories = ['alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] newsgroups = fetch_20newsgroups(categories=categories) y_true = newsgroups.target vectorizer = TfidfVectorizer(stop_words='english', min_df=5, tokenizer=number_aware_tokenizer) cocluster = SpectralCoclustering(n_clusters=len(categories), svd_method='arpack', random_state=0) kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000, random_state=0) print("Vectorizing...") X = vectorizer.fit_transform(newsgroups.data) print("Coclustering...") start_time = time() cocluster.fit(X) y_cocluster = cocluster.row_labels_ print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_cocluster, y_true))) print("MiniBatchKMeans...")
# Co-cluster the tab-separated matrix in dados_v2.txt and save three
# figures (original, rearranged, checkerboard) under ./solution.

# Re-running the script must not fail because the folder already exists.
if not os.path.isdir('solution'):
    os.mkdir('solution')

#n_clusters = (3, 2)
n_clusters = 20

# The last tab-separated field of each line is dropped. Building real lists
# (not map iterators) keeps the array numeric on Python 3 as well.
with open('dados_v2.txt') as arq:
    dados = np.array([[float(v) for v in a.split('\t')[:-1]]
                      for a in arq.readlines()])

# Figures show the transposed matrix (.T replaces zip(*...), which is an
# iterator on Python 3 and cannot be plotted).
plt.matshow(dados.T, cmap=cm.PiYG)
plt.title("Original dataset")
pl.savefig('solution/original.png', bbox_inches=0)

#model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0)
model = SpectralCoclustering(n_clusters=n_clusters, svd_method='arpack',
                             random_state=0)
model.fit(dados)

fit_data = dados[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data.T, cmap=cm.PiYG)
# Title is set before saving so it appears in the written image.
plt.title("After biclustering; rearranged to show biclusters")
pl.savefig('solution/biclustered.png', bbox_inches=0)

plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1).T,
            cmap=plt.cm.PiYG)
plt.title("Checkerboard structure of rearranged data")
pl.savefig('solution/biclustered and rearranged.png', bbox_inches=0)
# Script fragment: co-cluster the correlation matrix of a hashtag
# co-occurrence matrix and print the reordered result.
# NOTE(review): coOccurencesMatrix and vectorizer are created before this
# fragment.
print(coOccurencesMatrix)
print(coOccurencesMatrix.shape)
hashtags = vectorizer.get_feature_names()
hashtags = np.array(hashtags)
# Replaces zeros with zeros; the values are unchanged by this call.
coOccurencesMatrix = np.where(coOccurencesMatrix == 0, 0, coOccurencesMatrix)
#coOccurencesMatrix = StandardScaler().fit_transform(coOccurencesMatrix)
print(coOccurencesMatrix)
import copy
# Independent copy, also turned into a correlation matrix; it is not used
# again within this fragment.
coOccurencesMatrix2 = copy.deepcopy(coOccurencesMatrix)
coOccurencesMatrix2 = np.corrcoef(coOccurencesMatrix2)
# From here on the matrix holds row-wise correlation coefficients.
coOccurencesMatrix = np.corrcoef(coOccurencesMatrix)
nbClusters = 40
model = SpectralCoclustering(n_clusters=nbClusters, random_state=1)
model.fit(coOccurencesMatrix)
print("fit")
print(coOccurencesMatrix)
# Reorder rows, then columns, so members of a cluster are adjacent.
fit_data = coOccurencesMatrix[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
# Hashtag names in the same order as the rearranged rows / columns.
hashtagsrow = hashtags[np.argsort(model.row_labels_)]
hashtagscolumns = hashtags[np.argsort(model.column_labels_)]
print("rowlavels")
print(model.row_labels_)
print("columnlzbels")
print(model.column_labels_)
print("hashtags")
print(hashtags)
print(fit_data.shape)
print(fit_data)
plt.figure(figsize=(15, 25)) sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis], yticklabels=cluster_cell_types, vmin=0, vmax=1, linewidths=0.5) plt.xlabel('UNCURL clusters') plt.ylabel('Seurat clusters') plt.title('SCH Cerebellum Clusters') plt.savefig('uncurl_vs_seurat_clusters.png', dpi=200) # do a biclustering from sklearn.cluster.bicluster import SpectralCoclustering spec = SpectralCoclustering(18) cluster_counts_subset = np.vstack( [cluster_counts[:31, :], cluster_counts[32:, :]]) spec.fit(cluster_counts + 0.0001) row_labels = spec.row_labels_ column_labels = spec.column_labels_ row_order = np.argsort(row_labels) col_order = np.argsort(column_labels) #row_labels = row_labels[row_order] #col_labels = column_labels[col_order] cluster_counts_reordered = cluster_counts[row_order, :] cluster_counts_reordered = cluster_counts_reordered[:, col_order] cluster_cell_types_2 = np.array(
def test_co_clustering():
    """Exploratory co-clustering run on real fMRI data and simulated data.

    Three sections: (1) co-cluster the correlation similarity between the
    voxels of two ROI masks, (2) co-cluster a shuffled ``make_biclusters``
    matrix and print the consensus score, (3) a second cross-clustering run
    with default estimator settings.

    All imports are done once at the top. Previously ``scipy`` was imported
    as ``sp`` only in the last section, which made ``sp`` a function-local
    name and caused the earlier ``sp.spatial...`` call to fail with
    ``UnboundLocalError``.

    NOTE(review): ``home`` and ``generate_simple_blobs`` come from module
    scope; the file paths are machine-specific.
    """
    import nibabel as nb
    import numpy as np
    import scipy as sp
    import scipy.spatial.distance  # make sure sp.spatial.distance is loaded
    import sklearn as sk
    from matplotlib import pyplot as plt
    from sklearn import cluster
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.metrics import consensus_score

    # REAL DATA
    subject_file = '/Users/aki.nikolaidis/Desktop/NKI_SampleData/A00060280/3mm_bandpassed_demeaned_filtered_antswarp.nii.gz'
    roi_mask_file = home + '/git_repo/basc/masks/BG_3mm.nii.gz'
    roi2_mask_file = home + '/git_repo/basc/masks/yeo2_3mm.nii.gz'

    data = nb.load(subject_file).get_data().astype('float32')
    print('Data Loaded')

    print('Setting up NIS')
    roi_mask_nparray = nb.load(roi_mask_file).get_data().astype(
        'float32').astype('bool')
    roi2_mask_nparray = nb.load(roi2_mask_file).get_data().astype(
        'float32').astype('bool')

    roi1data = data[roi_mask_nparray]
    roi2data = data[roi2_mask_nparray]

    # L2-normalise every voxel time series.
    roi1data = sk.preprocessing.normalize(roi1data, norm='l2')
    roi2data = sk.preprocessing.normalize(roi2data, norm='l2')

    # Similarity = 1 - correlation distance, cleaned to lie in [0, 1].
    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[np.isnan(sim_btwn_data_1_2)] = 0
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0
    # Small random jitter (at most 0.01) added to every entry.
    sim_btwn_data_1_2 = sim_btwn_data_1_2 + (
        np.random.rand(*sim_btwn_data_1_2.shape)) / 100
    sim_btwn_data_1_2[sim_btwn_data_1_2 > 1] = 1

    model = SpectralCoclustering(n_clusters=5, random_state=0, n_init=100)
    model.fit(sim_btwn_data_1_2)

    fit_data = sim_btwn_data_1_2[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()

    #SIMULATION DATA
    #Creating Simulated Data
    data, rows, columns = make_biclusters(shape=(300, 100), n_clusters=5,
                                          noise=5, shuffle=False,
                                          random_state=0)

    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Original dataset")

    data, row_idx, col_idx = sg._shuffle(data, random_state=0)
    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Shuffled dataset")

    #Creating Model
    model = SpectralCoclustering(n_clusters=5, random_state=0)
    model.fit(data)
    score = consensus_score(model.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()

    ####################################################################
    ####################################################################
    data1 = generate_simple_blobs(27)
    data2 = generate_simple_blobs(27)
    data2 = data2[0:150, :]

    print("Calculating Cross-clustering")
    print("Calculating pairwise distances between areas")

    # NOTE(review): the distances are computed from roi1data/roi2data, not
    # from data1/data2 generated just above -- confirm which was intended.
    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0
    co_cluster = cluster.SpectralCoclustering()
    co_cluster.fit(sim_btwn_data_1_2)
    # NOTE(review): the score and the reordering below reuse rows/columns/
    # data from the simulation section although co_cluster was fitted on
    # the ROI similarity matrix -- confirm.
    score = consensus_score(co_cluster.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    fit_data = data[np.argsort(co_cluster.row_labels_)]
    fit_data = fit_data[:, np.argsort(co_cluster.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()
km = KMeans(n_clusters=k) km = km.fit(df.iloc[:,2:14]) SS.append(km.inertia_) plt.plot(NC,SS) plt.xlabel('k') plt.ylabel('SS') plt.show() from sklearn.cluster.bicluster import SpectralCoclustering flavour=df.iloc[:,2:14] corr_whisky=pd.DataFrame.corr(flavour.transpose()) print(corr_whisky) plt.figure(figsize=(8,8)) plt.pcolor(corr_whisky) import pandas as pd plt.colorbar() model=SpectralCoclustering(n_clusters=5,random_state=45) x=df["Distillery"] df["disteliries_group"]=pd.Series(x,index=df.index) cluster=list(zip(df.iloc[:,1],df.iloc[:,13])) cluster=sorted(cluster, key=lambda x: x[1]) print("the resultant grouped classified whiskey based on their flavour") print("\n") c=pd.DataFrame(cluster) print(c) model1=pickle.dump(cluster,open('model1.pkl','wb'))
def createSpectralCoclustering(params):
    """Return a ``SpectralCoclustering`` estimator with default settings.

    ``params`` is accepted but not read by this function.
    """
    # params['n_clusters'] = N
    return SpectralCoclustering()
cluster_colors = ["red", "orange", "green", "blue", "purple", "gray"] regions = [ "Speyside", "Highlands", "Lowlands", "Islands", "Campbelltown", "Islay" ] import numpy as np region_colors = dict(zip(regions, cluster_colors)) ## ENTER CODE HERE! ## print(region_colors) #from lectures import pandas as pd import pylab as plt whisky = pd.read_csv('whiskies.txt') whisky['Region'] = pd.read_csv('regions.txt') from sklearn.cluster.bicluster import SpectralCoclustering model = SpectralCoclustering(n_clusters=6, random_state=0) flavors = whisky.iloc[:, 2:14] corr_flavors = pd.DataFrame.corr(flavors) corr_whisky = pd.DataFrame.corr(flavors.transpose()) model.fit(corr_whisky) whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index) whisky = whisky.ix[np.argsort(model.row_labels_)] whisky = whisky.reset_index(drop=True) correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose()) correlations = np.array(correlations) distilleries = list(whisky.Distillery) correlation_colors = [] for i in range(len(distilleries)): for j in range(len(distilleries)): if correlations[i][
#build data data_init, rows, columns = make_biclusters(shape=(SIZE, SIZE), n_clusters=NB_CLUSTERS, noise=NOIZE, shuffle=False, random_state=0) # we dont want negative data data_init = np.absolute(data_init) #shuffle rows and columns! data, row_idx, col_idx = sg._shuffle(data_init, random_state=0) ######### sklearn algorithm ######### model = SpectralCoclustering(n_clusters=NB_CLUSTERS, random_state=0) model.fit(data) score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx])) fit_data = data[np.argsort(model.row_labels_)] fit_data = fit_data[:, np.argsort(model.column_labels_)] ################################# ######### our algorithm ######### custom_spectralcoclustering_result, r, c = bs.custom_spectral_biclustering( data, NB_CLUSTERS, K_MEANS_ITERATIONS) custom_score = consensus_score((r, c), (rows[:, row_idx], columns[:, col_idx])) ################################# ######### plot results part ######### print("consensus score sklearn: {:.3f}".format(score))