from datetime import datetime

import numpy as np
from sklearn.cluster.bicluster import SpectralCoclustering


def main():
    origin = open('10k.txt', 'r')
    lines = origin.readlines()
    x = []
    label = []
    for l in lines:
        l = l.split(',')
        ip1 = l[2].split('.')
        ip2 = l[3].split('.')
        # pack each dotted-quad IP into a single integer via a hex string
        d = [datetime.fromtimestamp(int(l[1][0:11])).hour,
             int("%02x%02x%02x%02x" % (int(ip1[0]), int(ip1[1]), int(ip1[2]), int(ip1[3])), 16),
             int("%02x%02x%02x%02x" % (int(ip2[0]), int(ip2[1]), int(ip2[2]), int(ip2[3])), 16)] \
            + l[4:6] + l[7:10]
        x.append(d)
    data = np.array(x, dtype='float32')
    model = SpectralCoclustering(n_clusters=5)
    model.fit(data)
    print model.rows_
    for i in range(5):
        print "Cluster" + str(i) + ':'
        for j in range(10000):
            if model.rows_[i][j]:
                print j,
        print ' '
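# Side note (a sketch, not from the source): the hex round-trip above packs a
# dotted-quad IP into one integer; the standard library does the same directly.
import socket
import struct

def ip_to_int(ip):
    # '1.2.3.4' -> 16909060, the same value as int("%02x"*4 % octets, 16)
    return struct.unpack('!I', socket.inet_aton(ip))[0]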
def find_disjoint_biclusters(self, biclusters_number=50):
    data = np.asarray_chkfinite(self.matrix)
    data[data == 0] = 0.000001
    coclustering = SpectralCoclustering(n_clusters=biclusters_number, random_state=0)
    coclustering.fit(data)
    biclusters = set()
    for i in range(biclusters_number):
        rows, columns = coclustering.get_indices(i)
        row_set = set(rows)
        columns_set = set(columns)
        if len(row_set) > 0 and len(columns_set) > 0:
            density = self._calculate_box_cluster_density(row_set, columns_set)
            # drop columns whose density falls below a quarter of the box density
            odd_columns = set()
            for column in columns_set:
                col_density = self._calculate_column_density(column, row_set)
                if col_density < density / 4:
                    odd_columns.add(column)
            columns_set.difference_update(odd_columns)
            if len(columns_set) == 0:
                continue
            # same pruning for rows
            odd_rows = set()
            for row in row_set:
                row_density = self._calculate_row_density(row, columns_set)
                if row_density < density / 4:
                    odd_rows.add(row)
            row_set.difference_update(odd_rows)
            if len(row_set) > 0 and len(columns_set) > 0:
                density = self._calculate_box_cluster_density(row_set, columns_set)
                biclusters.add(Bicluster(row_set, columns_set, density))
    return biclusters
def run(self, data):
    # NOTE: unlike SpectralBiclustering, SpectralCoclustering expects a single
    # integer n_clusters; the original passed a (n_gene_classes, n_classes)
    # tuple here, which fails inside the k-means step.
    bc = SpectralCoclustering(n_clusters=self.n_classes)
    bc.fit(data)
    gene_clusters = bc.row_labels_  # computed but unused
    cell_clusters = bc.column_labels_
    return cell_clusters
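# If separate cluster counts for genes (rows) and cells (columns) were the
# intent above, SpectralBiclustering does accept a tuple of
# (n_row_clusters, n_column_clusters); a minimal sketch under that assumption:
from sklearn.cluster.bicluster import SpectralBiclustering

bc = SpectralBiclustering(n_clusters=(4, 3), random_state=0)
# bc.fit(data); then bc.row_labels_ / bc.column_labels_ as before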
def test_spectral_coclustering():
    # Test Dhillon's Spectral CoClustering on a simple problem.
    param_grid = {'svd_method': ['randomized', 'arpack'],
                  'n_svd_vecs': [None, 20],
                  'mini_batch': [False, True],
                  'init': ['k-means++'],
                  'n_init': [10],
                  'n_jobs': [1]}
    random_state = 0
    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                    random_state=random_state)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert_equal(model.rows_.shape, (3, 30))
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)

            _test_shape_indices(model)
def test_spectral_coclustering(): """Test Dhillon's Spectral CoClustering on a simple problem.""" param_grid = {'svd_method': ['randomized', 'arpack'], 'n_svd_vecs': [None, 20], 'mini_batch': [False, True], 'init': ['k-means++'], 'n_init': [10], 'n_jobs': [1]} random_state = 0 S, rows, cols = make_biclusters((30, 30), 3, noise=0.5, random_state=random_state) S -= S.min() # needs to be nonnegative before making it sparse S = np.where(S < 1, 0, S) # threshold some values for mat in (S, csr_matrix(S)): for kwargs in ParameterGrid(param_grid): model = SpectralCoclustering(n_clusters=3, random_state=random_state, **kwargs) model.fit(mat) assert_equal(model.rows_.shape, (3, 30)) assert_array_equal(model.rows_.sum(axis=0), np.ones(30)) assert_array_equal(model.columns_.sum(axis=0), np.ones(30)) assert_equal(consensus_score(model.biclusters_, (rows, cols)), 1)
def update_bicluster(batch_df, task_df, compound_df, mode='RobustMT', K=5):
    if mode == 'RobustMT':
        n_tasks = task_df.shape[1] - 1
    elif mode == 'ST':
        n_tasks = 1
    elif mode == 'MT':
        n_tasks = task_df.shape[1]

    if not mode == 'ST':
        # cocluster of the minibatch predictive matrix
        X = preprocessing.scale(np.matrix(batch_df)[:, 0:n_tasks])
        cocluster = SpectralCoclustering(n_clusters=K, random_state=0)
        cocluster.fit(X)
        batch_df['batch_label'] = cocluster.row_labels_
    else:
        rank_x = batch_df[batch_df.columns[0]].rank().tolist()
        groups = pd.qcut(rank_x, K, duplicates='drop')
        batch_df['batch_label'] = groups.codes

    # generate color hex for batch_label
    lut = dict(zip(batch_df['batch_label'].unique(), Category20_20))
    batch_df['batch_label_color'] = batch_df['batch_label'].map(lut)

    # generate color hex for compound_df
    lut2 = dict(zip(batch_df['Label_id'], batch_df['batch_label_color']))
    compound_df['batch_label_color'] = compound_df['label'].map(lut2)
    lut22 = dict(zip(batch_df['Label_id'], batch_df['batch_label']))
    compound_df['batch_label'] = compound_df['label'].map(lut22)

    groups = pd.qcut(compound_df['label'].tolist(), len(Category20b_20),
                     duplicates='drop')
    c = [Category20b_20[xx] for xx in groups.codes]
    compound_df['label_color'] = c
    return batch_df, task_df, compound_df
def correlation_matrix(df):
    sns.set(style='white', font_scale=.9)
    clusters = 4
    pearson = df.drop(['asset', 'unixtime'], axis=1).corr(method='pearson')
    clust = SpectralCoclustering(n_clusters=clusters, random_state=0)
    clust.fit(pearson)
    pearson = pearson.iloc[np.argsort(clust.row_labels_)[::-1],
                           np.argsort(clust.column_labels_)]
    grid = dict(width_ratios=[1.5, pearson.shape[1]])
    fig, axs = plt.subplots(1, 2, figsize=(10, 8), gridspec_kw=grid)
    sns.heatmap(data=np.sort(clust.row_labels_)[::-1].reshape(-1, 1),
                ax=axs[0], cbar=False, linewidths=.005,
                cmap=sns.color_palette('Spectral'))
    axs[0].set(xticks=(), yticks=())
    sns.heatmap(data=pearson, cmap=sns.diverging_palette(220, 10, n=11),
                linewidths=.005, cbar_kws={'shrink': .75},
                vmax=1, vmin=-1, ax=axs[1])
    axs[1].set_xticklabels(pearson.columns, rotation='vertical')
    axs[1].set_yticklabels(pearson.index, rotation='horizontal')
    fig.suptitle(f'Variable Correlation Matrix in {clusters} Clusters', fontsize=20)
    fig.tight_layout(w_pad=.5, rect=(.03, 0, 1, .95))
    fig.savefig('reports/correlation_matrix.png')
def print_similarity_matrix(sphns, model, model2=None):
    print "    ",
    for phn1 in sphns:
        print phn1, " ",
    print ""
    m = np.ndarray((len(sphns), len(sphns)), dtype=np.float32)
    for i, phn1 in enumerate(sphns):
        print phn1.ljust(4) + ":",
        for j, phn2 in enumerate(sphns):
            sim = model.similarity(phn1, phn2)
            if model2 != None:
                sim -= model2.similarity(phn1, phn2)
            print "%0.2f" % sim,
            m[i][j] = sim
        print ""
    phn_order = [phn for phn in sphns]
    if BICLUSTER:
        #model = SpectralBiclustering(n_clusters=4, method='log',
        model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
        model.fit(m)
        print "INDICES:",
        indices = [model.get_indices(i) for i in xrange(n_clusters)]
        print indices
        tmp = []
        for i in xrange(n_clusters):
            tmp.extend([phn_order[indices[i][0][j]]
                        for j in xrange(len(indices[i][0]))])
        phn_order = tmp
        fit_data = m[np.argsort(model.row_labels_)]
        fit_data = fit_data[:, np.argsort(model.column_labels_)]
        m = fit_data
    return phn_order, m
def bi_clustering(data, args):
    print 'clustering...'
    # max_val = np.max(np.max(data))
    # data = -np.exp(data / data.std())
    max_val = np.max(np.max(data))
    data[data == 0] = max_val
    data = data / max_val
    model = SpectralCoclustering(n_clusters=args.k, svd_method='arpack')
    model.fit(data)
    np.savetxt(args.o, model.row_labels_, fmt="%d", newline="\n")
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    if not args.plot:
        return
    plt.matshow(shuffle(data), cmap=plt.cm.Blues)
    plt.title("Org dataset")
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()
def bicluster(*cotables):
    table = cotables[0]
    model = SpectralCoclustering(n_clusters=table.shape[1], random_state=0)
    model.fit(table.as_matrix())
    return [cotable.iloc[np.argsort(model.row_labels_),
                         np.argsort(model.column_labels_)]
            for cotable in cotables]
def spectral_co_cluster(data, n_clusters, para_jobs=1, random_state=None):
    from sklearn.cluster.bicluster import SpectralCoclustering
    model = SpectralCoclustering(n_clusters, random_state=random_state,
                                 n_jobs=para_jobs)
    model.fit(data)
    row_labels = model.row_labels_
    col_labels = model.column_labels_
    return row_labels, col_labels
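# A minimal usage sketch for the wrapper above (the toy matrix and cluster
# count are illustrative assumptions, not taken from the source):
import numpy as np

toy = np.abs(np.random.RandomState(0).randn(40, 30))  # nonnegative toy matrix
row_labels, col_labels = spectral_co_cluster(toy, n_clusters=3, random_state=0)
assert row_labels.shape == (40,) and col_labels.shape == (30,)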
def biclustering(input, num_clusters):
    global agent1_dict
    data = np.matrix(input)
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(data)
    # create agent 1 dictionary
    agent1_dict = {}
    for c in range(num_clusters):
        agent1_dict[c] = model.get_indices(c)[0].tolist()  # 0 row indices, 1 column indices
    return agent1_dict
def get_clusters(data):
    coclusters = SpectralCoclustering(n_clusters=5, random_state=0)
    coclusters.fit(data)
    word_clusters = []
    hidden_clusters = []
    for i in range(5):
        wc = coclusters.get_indices(i)[0]
        hc = coclusters.get_indices(i)[1]
        word_clusters.append(wc.tolist())
        hidden_clusters.append(hc.tolist())
    return word_clusters, hidden_clusters
def biclustering(data, num_clusters):
    clusters = {}
    data = np.asmatrix(data)
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    #model = SpectralBiclustering(n_clusters=num_clusters)
    model.fit(data)
    for c in range(num_clusters):
        clusters[c] = model.get_indices(c)[0].tolist()  # 0 row indices, 1 column indices
    #fit_data = data[np.argsort(model.row_labels_)]
    #fit_data = fit_data[:, np.argsort(model.column_labels_)]
    #plot(fit_data)
    return clusters
import numpy as np
import pylab as pl
from sklearn.cluster.bicluster import SpectralCoclustering


def main():
    origin = open('kddcup.txt', 'r')
    lines = origin.readlines()
    x = []
    label = []
    for l in lines:
        l = l.split(',')
        d = l[0:1] + l[4:19] + l[21:-1]
        label.append(l[-1])
        x.append(d)
    data = np.array(x, dtype='float32')
    model = SpectralCoclustering(n_clusters=5)
    model.fit(data)
    evaluation = []
    draw_n_x = []
    draw_n_y = []
    draw_a_x = []
    draw_a_y = []
    for cluster in model.rows_:
        normal = 0.0
        attack = 0.0
        graph_x = []
        graph_y = []
        for idx in range(len(cluster)):
            if cluster[idx]:
                if label[idx] == 'normal.\n':
                    normal += 1
                else:
                    attack += 1
                # index the current row; the original appended data[27] and
                # data[30], i.e. two fixed whole rows, on every iteration
                graph_x.append(data[idx][27])
                graph_y.append(data[idx][30])
        evaluation.append(normal / (normal + attack))
        if normal > attack:
            draw_n_x += graph_x
            draw_n_y += graph_y
        else:
            draw_a_x += graph_x
            draw_a_y += graph_y
    pl.plot(draw_n_x, draw_n_y, 'ro')
    pl.plot(draw_a_x, draw_a_y, 'go')
    print evaluation
    pl.show()
def biclustering(input, num_clusters):
    global agent1_dict
    data = np.matrix(input)
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(data)
    # create agent 1 dictionary
    agent1_dict = {}
    for c in range(num_clusters):
        agent1_dict[c] = model.get_indices(c)[0].tolist()  # 0 row indices, 1 column indices
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plot(fit_data)
    return agent1_dict
def cluster_data(flavors, whisky):
    corr_whisky = pd.DataFrame.corr(flavors.transpose())
    model = SpectralCoclustering(n_clusters=6, random_state=0)
    model.fit(corr_whisky)
    whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
    whisky = whisky.ix[np.argsort(model.row_labels_)]
    whisky = whisky.reset_index(drop=True)
    correlation = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
    correlation = np.array(correlation)
    # print(np.sum(model.rows_, axis=1))
    # print(np.sum(model.rows_, axis=0))
    # print(model.row_labels_)
    # print(correlation)
    plot_correlations(correlation)
def plot_coclusters_raw_data(time_ms, t=False):
    # take the transpose of the sliced matrix when t is set
    # (the original computed the same thing in both branches)
    if t:
        channels_data = slice_matrix(matrix, time_ms).T
    else:
        channels_data = slice_matrix(matrix, time_ms)
    print len(channels_data), len(channels_data[1])
    z_score = stats.zscore(channels_data)
    plt.title('Z Score Biclustering Over %i ms' % time_ms)
    spectral_model = SpectralCoclustering()
    spectral_model.fit(z_score)
    fit_data = z_score[np.argsort(spectral_model.row_labels_)]
    fit_data = fit_data[:, np.argsort(spectral_model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.savefig('z_score_raw_coclustering_all_ts_%i_T_%s.svg' % (time_ms, str(t)))
def cocluster(self, mx, blockdiag=False):
    logging.info('Co-clustering Tade..')
    if blockdiag:
        logging.info('blockdiag')
        clusser = SpectralCoclustering(n_jobs=-1)
    else:  # checkerboard
        logging.info('checkerboard')
        clusser = SpectralBiclustering(n_jobs=-1, n_clusters=(4, 3))
        # n_clusters=3, svd_method='randomized',
    clusser.fit(mx)
    logging.info('Argsorting mx rows..')
    mx = mx[np.argsort(clusser.row_labels_)]
    self.prev = self.prev[np.argsort(clusser.row_labels_)]
    logging.info('Argsorting mx cases..')
    mx = mx[:, np.argsort(clusser.column_labels_)]
    self.case = self.case[np.argsort(clusser.column_labels_)]
    return mx
def main():
    files = [DATA_DIR + file for file in os.listdir(DATA_DIR)
             if fnmatch.fnmatch(file, '*.csv')]
    for i in files:
        print('processing', i, '...')
        table = get_data(i)
        cl = SpectralCoclustering(n_clusters=2, random_state=0)
        cl.fit(table)
        # using http://scikit-learn.org/stable/auto_examples/bicluster/plot_spectral_coclustering.html
        fit_data = table[np.argsort(cl.row_labels_)]
        fit_data = fit_data[:, np.argsort(cl.column_labels_)]
        plt.matshow(fit_data, cmap=plt.cm.Reds)
        plt.title(i[len(DATA_DIR):])
        # plt.show()
        plt.savefig(i[len(DATA_DIR):-4] + '.pdf')
def main(model):
    store = pd.HDFStore(model)
    from_ = store['from_'][0][0]
    to = store['to'][0][0]
    assert from_ == 0
    trace_fpath = store['trace_fpath'][0][0]
    Theta_zh = store['Theta_zh'].values
    Psi_oz = store['Psi_sz'].values
    count_z = store['count_z'].values[:, 0]

    Psi_oz = Psi_oz / Psi_oz.sum(axis=0)
    Psi_zo = (Psi_oz * count_z).T
    Psi_zo = Psi_zo / Psi_zo.sum(axis=0)

    obj2id = dict(store['source2id'].values)
    hyper2id = dict(store['hyper2id'].values)
    id2obj = dict((v, k) for k, v in obj2id.items())

    ZtZ = Psi_zo.dot(Psi_oz)
    ZtZ = ZtZ / ZtZ.sum(axis=0)
    L = ZtZ
    #ZtZ[ZtZ < (ZtZ.mean())] = 0
    L[ZtZ >= 1.0 / (len(ZtZ))] = 1
    L[L != 1] = 0

    colormap = toyplot.color.brewer.map("Purples", domain_min=0,
                                        domain_max=1, reverse=True)
    print(colormap)
    canvas = toyplot.matrix((L.T, colormap), label="P[z' | z]",
                            colorshow=False, tlabel="To z'", llabel="From")[0]
    #canvas.axes(ylabel='From z', xlabel='To z\'')
    toyplot.pdf.render(canvas, 'tmat.pdf')

    model = SpectralCoclustering(n_clusters=3)
    model.fit(L)
    fit_data = L[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    canvas = toyplot.matrix((fit_data, colormap), label="P[z' | z']",
                            colorshow=False)[0]
    toyplot.pdf.render(canvas, 'tmat-cluster.pdf')

    #AtA = Psi_oz.dot(Psi_zo)
    #np.fill_diagonal(AtA, 0)
    #AtA = AtA / AtA.sum(axis=0)
    store.close()
def biclustering(db):
    #mydata = genfromtxt('/home/fan/intern/process_db/analysis/viewtime_matrix_524.csv', dtype=None, delimiter=',', names=True, skip_header=1)
    df = pd.read_csv('/home/fan/intern/process_db/analysis/viewtime_matrix_501_0.1.csv')
    dma = 501
    #print df.head()
    print df.shape
    dev_list = df.ix[:, 0].values
    prog_list = df.columns.values
    df.drop(df.columns[0], axis=1, inplace=True)
    #df[df==0] = 1
    df = df.apply(fraction, axis=1)
    #model = SpectralCoclustering(n_clusters=5, random_state=0)  # n_clusters=(1000,20), 4*3 = 12 clusters
    #model = SpectralBiclustering(random_state=None)
    model = SpectralCoclustering(n_clusters=10)
    model.fit(df)
    #fit_data = mydata[np.argsort(model.row_labels_)]
    #fit_data = fit_data[:, np.argsort(model.column_labels_)]
    #plt.matshow(fit_data[0:40], cmap=plt.cm.Blues)
    #plt.show()
    print model.get_params()
    for i in range(0, 5):
        print 'Size of one cluster:', model.get_shape(i)
        indices = model.get_indices(i)
        #print indices[1]
        print prog_list[indices[1]]
        print model.get_submatrix(i, df.values)
        dev_in_cluster = dev_list[indices[0]]
        print 'number of devices within this cluster:', len(dev_in_cluster)
        get_income(db, dma, dev_in_cluster.tolist())
def biclustering(input_list, num_clusters):
    global agent1_dict
    # clustering agent 1
    data = np.matrix(input_list)
    #plot(data)  # original data
    #model = SpectralBiclustering(n_clusters=num_clusters)
    # Biclustering: http://scikit-learn.org/stable/auto_examples/bicluster/plot_spectral_biclustering.html#example-bicluster-plot-spectral-biclustering-py
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    # Coclustering: http://scikit-learn.org/stable/auto_examples/bicluster/plot_spectral_coclustering.html
    model.fit(data)
    # create agent 1 dictionary
    agent1_dict = {}
    for c in range(num_clusters):
        agent1_dict[c] = model.get_indices(c)[0].tolist()  # 0 row indices, 1 column indices
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plot(fit_data)
    return agent1_dict
def spectral_coclustering(cls, *args):
    """
    Wrapper method for the spectral_coclustering algorithm

    :param args: the arguments to be sent to the sci-kit implementation
    :return: returns the Biclustering object
    """
    model = SpectralCoclustering(*args)
    return cls(model)
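# A hedged usage sketch for the classmethod above; `Biclustering` is the
# (assumed) host class that wraps the sklearn estimator, and the positional
# argument maps to n_clusters:
# wrapper = Biclustering.spectral_coclustering(5)
# wrapper.model.fit(some_nonnegative_matrix)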
def Spectral_CoClustering(args):
    '''Function to perform bipartite clustering'''
    # Create model
    try:
        if args.arpack:
            model = SpectralCoclustering(n_clusters=args.nClusters,
                                         svd_method='arpack')
        else:
            model = SpectralCoclustering(n_clusters=args.nClusters)
    except Exception:
        # re-raise so `model` is never used while undefined
        print('-r 1 may cause problems when svd_method has been set to arpack')
        raise
    print('Running coclustering')
    model.fit(args.M.tocsc())
    print('Coclustering done')
    # Fit to data
    # fit_data = args.M[np.argsort(model.row_labels_)]
    # fit_data = fit_data[:, np.argsort(model.column_labels_)]
    fit_data = args.M.tocoo()
    fit_data.row = invert_permutation(np.argsort(model.row_labels_))[fit_data.row]
    fit_data.col = invert_permutation(np.argsort(model.column_labels_))[fit_data.col]
    save_clusters(model, fit_data, args, '_CoClustering')
    return model, fit_data
def plot_biclusters():
    co_grid = ParameterGrid(
        {'n_clusters': np.arange(2, 10, 1), 'n_init': [20]}
    )
    _y = pd.read_csv('./../../data_source/to_analysis/original_images/dfs_original_images.csv',
                     index_col=0)
    y_orig = np.squeeze(_y.values)
    X_orig = pd.read_csv('./../../data_source/to_analysis/original_images/all_features_original_images.csv',
                         index_col=0)
    scaler = StandardScaler()
    X_orig_std = scaler.fit_transform(X_orig.values)
    #_run_experiment(co_grid, X_orig_std)
    df_avg_co_scores = pd.read_csv('bic_scores.csv', index_col=0)
    best_co_config = co_grid[
        np.argmin(df_avg_co_scores.loc[:, 'tvr'].values) - 1
    ]
    print(best_co_config, min(df_avg_co_scores.loc[:, 'tvr'].values))
    orig_co_model = SpectralCoclustering(random_state=0, svd_method='arpack')
    orig_co_model.set_params(**best_co_config)
    orig_co_model.fit(X_orig_std)
    #plt.figure()
    #_plot_tve(df_avg_co_scores, co_grid)
    plt.figure()
    _plot_bicmaps(X_orig_std, best_co_config)
def cluster_ex_by_feature_matrix(sub_ex_by_feat_mat, plot_file):
    if sub_ex_by_feat_mat.shape[0] > 50000:
        print "Matrix too large to be efficient, please reduce number of examples"
    # Subset down to motifs that are used
    plot_df = sub_ex_by_feat_mat[:, np.apply_along_axis(
        np.max, 0, sub_ex_by_feat_mat.toarray()) != 0]
    # Alternative filters from the original, left as comments; they refer to
    # sub_ex_by_feat_mat_1 / sub_ex_by_feat_df2, which are not defined here.
    # for numpy array
    #plot_df = sub_ex_by_feat_mat_1[np.apply_along_axis(
    #    lambda row: (row != 0).sum(), 1, sub_ex_by_feat_mat_1.toarray()) > 10, :]
    #plot_df = plot_df[:, np.apply_along_axis(
    #    lambda column: (column != 0).sum(), 0, sub_ex_by_feat_mat_1.toarray()) > 50]
    # for pandas
    #plot_df = sub_ex_by_feat_df2.ix[
    #    sub_ex_by_feat_df2.apply(lambda row: (row != 0).sum(), 1) > 10, :]
    #plot_df = plot_df.ix[:, plot_df.apply(lambda row: (row != 0).sum(), 0) > 50]
    #plot_df = sub_ex_by_feat_df2

    model = SpectralCoclustering(n_clusters=50)
    model.fit(plot_df)  # fits for 50K
    # note: the .ix indexing below assumes plot_df is a DataFrame (the pandas path)
    fit_data = plot_df.ix[np.argsort(model.row_labels_)]
    fit_data = fit_data.ix[:, np.argsort(model.column_labels_)]
    plt.figure()
    plt.matshow(fit_data.ix[0:500, ], cmap=plt.cm.YlGnBu, aspect='auto')
    plt.savefig(plot_file)
    print "DONE: biclustering plot here: {0}".format(plot_file)
    return "pretty picture"
def _plot_bicmaps(X_orig_std, best_co_config):
    # Train model with best config.
    orig_co_model = SpectralCoclustering(random_state=0, svd_method='arpack')
    orig_co_model.set_params(**best_co_config)
    orig_co_model.fit(X_orig_std)

    orig_co_row_sorted = X_orig_std[np.argsort(orig_co_model.row_labels_), :]
    orig_co_fit_data = orig_co_row_sorted[:, np.argsort(orig_co_model.column_labels_)]

    hmap = sns.heatmap(
        orig_co_fit_data,
        robust=True,
        cmap=plt.cm.viridis,
        fmt='f',
        vmin=np.min(orig_co_fit_data),
        vmax=np.max(orig_co_fit_data),
        cbar=False
    )
    coords = bic_coords(orig_co_model, best_co_config['n_clusters'])
    for num in coords.index:
        plt.plot(
            (coords.loc[num, ['x1', 'x2', 'x2', 'x1', 'x1']]),
            (coords.loc[num, ['y1', 'y1', 'y2', 'y2', 'y1']]),
            c='darkred'
        )
    plt.ylabel('Patients')
    plt.xlabel('Features')
    plt.yticks([], [])
    plt.xticks([], [])

    ax_divider = make_axes_locatable(hmap)
    cax = ax_divider.append_axes('right', size='3%', pad='2%')
    colorbar.colorbar(
        hmap.get_children()[0],
        cax=cax,
        orientation='vertical'
    )
    #cax.xaxis.set_label_text('AUC', fontname='Sans')
    #cax.xaxis.set_label_position('top')
    cbar_ticks = np.linspace(
        np.nanmin(orig_co_fit_data), np.nanmax(orig_co_fit_data), 6
    )
    cax.yaxis.set_ticks(cbar_ticks)
    cax.yaxis.set_ticklabels([f'{num:.01f}' for num in cbar_ticks])
    plt.savefig(
        '../biclustering/bic_map_original_images.pdf',
        bbox_inches='tight',
        transparent=True,
        dpi=CONFIG.DPI,
    )
def _get_clusters_using_spectrals(corrarr, n_clusters=5, mode='co'):
    if mode == 'co':
        model = SpectralCoclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        return clusters
    elif mode == 'bi':
        model = SpectralBiclustering(n_clusters, random_state=0)
        model.fit(corrarr)
        indices = np.arange(corrarr.columns.size)
        clusters = [indices[x].tolist() for x in model.columns_]
        repetition_start = clusters[1:].index(clusters[0]) + 1
        return clusters[:repetition_start]
    else:
        # raising a bare string is invalid in Python 3
        raise ValueError("Mode wrong?")
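# Side note (a minimal sketch, not from the source): indexing with the boolean
# masks in model.columns_, as above, is equivalent to the estimator's own
# get_indices accessor, which returns (row_indices, column_indices) per bicluster.
import numpy as np
from sklearn.cluster.bicluster import SpectralCoclustering

X = np.abs(np.random.RandomState(0).rand(20, 12))
m = SpectralCoclustering(n_clusters=3, random_state=0).fit(X)
for i in range(3):
    rows_i, cols_i = m.get_indices(i)
    assert (np.where(m.columns_[i])[0] == cols_i).all()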
def _run_experiment(co_grid, X_orig_std):
    np.random.seed(seed=0)
    random_states = np.random.choice(40, size=40)

    avg_co_scores = {}
    for num, co_param_config in enumerate(co_grid):
        orig_co_scores = []
        for random_state in random_states:
            orig_co_model = SpectralCoclustering(random_state=random_state,
                                                 svd_method='arpack')
            # NOTE: Outputs a TVE score.
            orig_co_clusters = biclusters(orig_co_model, X_orig_std,
                                          co_param_config)
            orig_co_scores.append(orig_co_clusters.external_metrics.values)
        avg_co_scores[num] = np.nanmean(orig_co_scores, axis=0)

    avg_orig_co_scores = []
    for num, scores in enumerate(avg_co_scores.values()):
        avg_orig_co_scores.append(np.mean(scores, axis=0))

    df_avg_co_scores = pd.DataFrame(avg_orig_co_scores, columns=['tvr'])
    df_avg_co_scores.index.name = 'ConfigID'
    df_avg_co_scores.to_csv('bic_scores.csv')
def Block_diagonal(input_path, top_sd, n_clusters, output_path):
    ### input data
    input_dat = pd.read_csv(input_path, index_col=0, sep='\t', comment='#')
    ### get index and sample name
    # get_index = input_dat.index.astype(str)+'_'+input_dat.ix[:,0].astype(str)+'_'+\
    #     input_dat.ix[:,1].astype(str)+'_'+input_dat.ix[:,2].astype(str)
    # get_samp_name = input_dat.columns[3:]
    pro_dat = input_dat.fillna(0)
    # pro_dat = pro_dat.ix[:,3:]
    # pro_dat.index = get_index
    # pro_dat.columns = get_samp_name
    # pro_dat = 2**pro_dat-1
    # keep the top fraction of rows by standard deviation
    df_sd = pro_dat.apply(np.std, axis=1)
    df_sd_sort = df_sd.sort_values(ascending=False)
    df_sd_sort_top = df_sd_sort.ix[:int(len(df_sd_sort) * top_sd)]
    pro_dat = pro_dat.ix[df_sd_sort_top.index, :]
    sd_index = pro_dat.index
    sd_sample_names = pro_dat.columns
    #plt.matshow(pro_dat, cmap=plt.cm.Blues)
    #plt.title("Original dataset")

    ### model
    model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    model.fit(pro_dat)
    pro_dat = np.array(pro_dat)
    fit_data = pro_dat[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    ### output the model fitting data
    fit_data = pd.DataFrame(fit_data)
    fit_data.index = sd_index[np.argsort(model.row_labels_)]
    fit_data.columns = sd_sample_names[np.argsort(model.column_labels_)]
    out_fit_data_path = os.path.join(output_path, 'fit_data.csv')
    fit_data.to_csv(out_fit_data_path)

    ### output image
    fig = plt.figure(figsize=(20, 40))
    ax = fig.add_subplot(111)
    ax.matshow(fit_data, cmap=plt.cm.Blues)
    #cax = ax.matshow(pro_dat, interpolation='nearest')
    #fig.colorbar(cax)
    ax.set_title("After biclustering; rearranged to show biclusters")
    # ax.set_xticklabels(fit_data.columns)
    # ax.set_yticklabels(fit_data.index)
    out_img_path = os.path.join(output_path, 'bicluster.png')
    fig.savefig(out_img_path)

    ### output module
    a11 = pd.Series(model.row_labels_)
    b11 = pd.Series(model.column_labels_)
    c11 = a11.groupby(a11).size()
    c22 = b11.groupby(b11).size()
    d11 = pd.DataFrame(a11.sort_values().values, fit_data.index.values)
    d22 = pd.DataFrame(b11.sort_values().values, fit_data.columns.values)
    d11.columns = ['cpg_module']
    d22.columns = ['sample_module']
    out_module_path = os.path.join(output_path, 'output.xlsx')
    writer = pd.ExcelWriter(out_module_path)
    d11.to_excel(writer, 'Sheet1')
    d22.to_excel(writer, 'Sheet2')
    writer.save()

    print("\n")
    print("cpg module:")
    print(c11)
    print("\n")
    print("sample module:")
    print(c22)
for row in ratings:
    user_id = user_ids.index(row[0])
    profile_id = profile_ids.index(row[1])
    user_profile_matrix[user_id, profile_id] = row[2]

# find number of users and profiles in each bicluster
'''G = nx_graph_from_biadjacency_matrix(user_profile_matrix)
nx.draw(G)
plt.show()'''

# initialize and carry out clustering
K = 50
scc = SpectralCoclustering(n_clusters=K, svd_method='arpack')
scc.fit(user_profile_matrix)

# labels
row_labels = scc.row_labels_
column_labels = scc.column_labels_

bicluster_num_users = np.zeros(K)
bicluster_num_profiles = np.zeros(K)
bicluster_list_users = []
bicluster_list_profiles = []
for i in range(K):
    bicluster_list_users.append([])
# exclude 'comp.os.ms-windows.misc'
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target

vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show
from sklearn.cluster.bicluster import SpectralCoclustering

data = pd.read_csv('docs/whiskies.txt')
data['Region'] = pd.read_csv('docs/regions.txt')
correlations = np.array(data.iloc[:, 2:14].transpose().corr())

plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.title("Original")
plt.pcolor(correlations, cmap='inferno')
plt.colorbar()

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(correlations)
data['Group'] = model.row_labels_
data = data.ix[np.argsort(model.row_labels_)]
data = data.reset_index(drop=True)
correlations = correlations[np.argsort(model.row_labels_), :]
correlations = correlations[:, np.argsort(model.row_labels_)]

plt.subplot(122)
plt.title("Rearranged")
plt.pcolor(correlations, cmap='inferno')
plt.colorbar()
plt.savefig('plots/classifying_whiskies_1')

group_colors = ['red', 'yellow', 'green', 'blue', 'purple', 'orange']
correlation_colors = []
for i in range(len(correlations)):
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5,
    shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

# note: the data was generated with 5 biclusters, but 6 are requested here
model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.3f}".format(score))

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
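# sg._shuffle is a private sklearn helper and was removed in later releases;
# a self-contained alternative (a sketch, shuffling rows and columns with numpy):
# rng = np.random.RandomState(0)
# row_idx, col_idx = rng.permutation(data.shape[0]), rng.permutation(data.shape[1])
# data = data[row_idx][:, col_idx]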
plt.figure(figsize=(15, 25))
sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis],
            yticklabels=cluster_cell_types, vmin=0, vmax=1, linewidths=0.5)
plt.xlabel('UNCURL clusters')
plt.ylabel('Seurat clusters')
plt.title('SCH Cerebellum Clusters')
plt.savefig('uncurl_vs_seurat_clusters.png', dpi=200)

# do a biclustering
from sklearn.cluster.bicluster import SpectralCoclustering
spec = SpectralCoclustering(18)
cluster_counts_subset = np.vstack(
    [cluster_counts[:31, :], cluster_counts[32:, :]])  # note: currently unused
spec.fit(cluster_counts + 0.0001)
row_labels = spec.row_labels_
column_labels = spec.column_labels_
row_order = np.argsort(row_labels)
col_order = np.argsort(column_labels)
#row_labels = row_labels[row_order]
#col_labels = column_labels[col_order]
cluster_counts_reordered = cluster_counts[row_order, :]
cluster_counts_reordered = cluster_counts_reordered[:, col_order]
cluster_cell_types_2 = np.array(
def test_co_clustering():
    import numpy as np
    import nibabel as nb
    from matplotlib import pyplot as plt
    import sklearn as sk
    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.metrics import consensus_score

    # REAL DATA
    subject_file = '/Users/aki.nikolaidis/Desktop/NKI_SampleData/A00060280/3mm_bandpassed_demeaned_filtered_antswarp.nii.gz'
    roi_mask_file = home + '/git_repo/basc/masks/BG_3mm.nii.gz'
    roi2_mask_file = home + '/git_repo/basc/masks/yeo2_3mm.nii.gz'

    data = nb.load(subject_file).get_data().astype('float32')
    print('Data Loaded')
    print('Setting up NIS')
    roi_mask_file_nb = nb.load(roi_mask_file)
    roi2_mask_file_nb = nb.load(roi2_mask_file)
    roi_mask_nparray = nb.load(roi_mask_file).get_data().astype('float32').astype('bool')
    roi2_mask_nparray = nb.load(roi2_mask_file).get_data().astype('float32').astype('bool')

    roi1data = data[roi_mask_nparray]
    roi2data = data[roi2_mask_nparray]
    # add code that uploads the roi1data and roi2data, divides by the mean
    # and standard deviation of the timeseries
    roi1data = sk.preprocessing.normalize(roi1data, norm='l2')
    roi2data = sk.preprocessing.normalize(roi2data, norm='l2')

    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[np.isnan(sim_btwn_data_1_2)] = 0
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0
    sim_btwn_data_1_2 = sim_btwn_data_1_2 + (np.random.rand(
        len(sim_btwn_data_1_2), len(sim_btwn_data_1_2[1, :]))) / 100
    sim_btwn_data_1_2[sim_btwn_data_1_2 > 1] = 1
    sum(sum(sim_btwn_data_1_2 == np.inf))
    sum(sum(sim_btwn_data_1_2 == np.nan))

    model = SpectralCoclustering(n_clusters=5, random_state=0, n_init=100)
    model.fit(sim_btwn_data_1_2)
    fit_data = sim_btwn_data_1_2[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()

    # SIMULATION DATA
    # Creating Simulated Data
    data, rows, columns = make_biclusters(shape=(300, 100), n_clusters=5,
                                          noise=5, shuffle=False, random_state=0)
    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Original dataset")
    data, row_idx, col_idx = sg._shuffle(data, random_state=0)
    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Shuffled dataset")

    # Creating Model
    model = SpectralCoclustering(n_clusters=5, random_state=0)
    model.fit(data)
    score = consensus_score(model.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))
    print("consensus score: {:.3f}".format(score))
    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()

####################################################################
####################################################################
from sklearn import cluster, datasets
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score
import scipy as sp
import time
import numpy as np
from matplotlib import pyplot as plt

data1 = generate_simple_blobs(27)
data2 = generate_simple_blobs(27)
data2 = data2[0:150, :]

print("Calculating Cross-clustering")
print("Calculating pairwise distances between areas")
dist_btwn_data_1_2 = np.array(
    sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0

co_cluster = cluster.SpectralCoclustering()
co_cluster.fit(sim_btwn_data_1_2)
score = consensus_score(co_cluster.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.3f}".format(score))
fit_data = data[np.argsort(co_cluster.row_labels_)]
fit_data = fit_data[:, np.argsort(co_cluster.column_labels_)]
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
listOfAbstracts = []
for paper in papers:
    if 'Abstract' in paper['MedlineCitation']['Article'].keys():
        listOfAbstracts.append(mergeAbstract(
            paper['MedlineCitation']['Article']['Abstract']['AbstractText']))

# Create TF-IDF matrix
# note: max_df=1 (an int) keeps only terms that occur in a single document;
# max_df=1.0 (no upper cutoff) was probably intended
vect = TfidfVectorizer(max_df=1)
tfidf = vect.fit_transform(listOfAbstracts)

# Non-negative Matrix Factorization
num_topics = 2
num_top_words = 5
nmf = decomposition.NMF(n_components=num_topics, random_state=1)
doctopic = nmf.fit_transform(tfidf)
topic_words = []
vocab = np.array(vect.get_feature_names())
for topic in nmf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

# Coclustering
cocluster = SpectralCoclustering(n_clusters=5, svd_method='arpack',
                                 random_state=0)
cocluster.fit(tfidf)
y_cocluster = cocluster.row_labels_
x_cocluster = cocluster.column_labels_
# print(np.array(vect.get_feature_names())[x_cocluster == 4])
#mM[np.where(np.logical_and(mM >= 0.25, mM < 1.25))] = 0.5
#mM[mM >= 1.25] = 1
#####

matDF = pd.DataFrame(MatOut).set_index(np.array(indx))
matDF.columns = areas

# Original plot
plt.matshow(MatOut, cmap=plt.cm.Blues)
plt.title("Original dataset")

clusters = 8  # 6
model = SpectralCoclustering(n_clusters=clusters)
#model = SpectralBiclustering(n_clusters=clusters)
model.fit(matDF)

fitData_c = matDF.columns[np.argsort(model.column_labels_)]
matDF = matDF[fitData_c]
fitData_i = matDF.index[np.argsort(model.row_labels_)]
matDF = matDF.reindex(fitData_i)
column_names = np.array([i[13:16] for i in fitData_c])

# plot
fig = plt.figure()
ax = fig.add_subplot(111)
km = KMeans(n_clusters=k)
km = km.fit(df.iloc[:, 2:14])
SS.append(km.inertia_)

plt.plot(NC, SS)
plt.xlabel('k')
plt.ylabel('SS')
plt.show()

import pandas as pd
from sklearn.cluster.bicluster import SpectralCoclustering

flavour = df.iloc[:, 2:14]
corr_whisky = pd.DataFrame.corr(flavour.transpose())
print(corr_whisky)
plt.figure(figsize=(8, 8))
plt.pcolor(corr_whisky)
plt.colorbar()

model = SpectralCoclustering(n_clusters=5, random_state=45)
model.fit(corr_whisky)  # note: the original instantiated the model but never fit it
x = df["Distillery"]
df["disteliries_group"] = pd.Series(x, index=df.index)
cluster = list(zip(df.iloc[:, 1], df.iloc[:, 13]))
cluster = sorted(cluster, key=lambda x: x[1])
print("the resultant grouped classified whiskey based on their flavour")
print("\n")
c = pd.DataFrame(cluster)
print(c)
model1 = pickle.dump(cluster, open('model1.pkl', 'wb'))
        continue
    # TODO hack: skip very long lists
    if skip_thresh and len(sources) > skip_thresh:
        continue
    # All events have numbered tweets
    rowSelector = np.array([row_lookup[source] for source in sources])
    data[rowSelector, j] = 1

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")
plt.savefig('%s_original.png' % (identifier), bbox_inches='tight')

model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
model.fit(data)
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged")
plt.savefig('%s_clustered.png' % (identifier), bbox_inches='tight')

avg_data = np.copy(data)
# Compute average value in each co-cluster for display purposes
for c in range(n_clusters):
    for d in range(n_clusters):
print(coOccurencesMatrix)
print(coOccurencesMatrix.shape)

hashtags = vectorizer.get_feature_names()
hashtags = np.array(hashtags)

coOccurencesMatrix = np.where(coOccurencesMatrix == 0, 0, coOccurencesMatrix)  # no-op as written
#coOccurencesMatrix = StandardScaler().fit_transform(coOccurencesMatrix)
print(coOccurencesMatrix)

import copy
coOccurencesMatrix2 = copy.deepcopy(coOccurencesMatrix)
coOccurencesMatrix2 = np.corrcoef(coOccurencesMatrix2)
coOccurencesMatrix = np.corrcoef(coOccurencesMatrix)

nbClusters = 40
model = SpectralCoclustering(n_clusters=nbClusters, random_state=1)
model.fit(coOccurencesMatrix)
print("fit")
print(coOccurencesMatrix)

fit_data = coOccurencesMatrix[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
hashtagsrow = hashtags[np.argsort(model.row_labels_)]
hashtagscolumns = hashtags[np.argsort(model.column_labels_)]

print("row labels")
print(model.row_labels_)
print("column labels")
print(model.column_labels_)
print("hashtags")
print(hashtags)
print(fit_data.shape)
print(fit_data)
corr_flavors = pd.DataFrame.corr(flavors)
corr_flavors

plt.figure(figsize=(10, 10))
plt.pcolor(corr_flavors)
plt.colorbar()
plt.savefig('./python_case_studies/whisky/corr_flavors.pdf')

corr_whisky = pd.DataFrame.corr(flavors.transpose())
plt.figure(figsize=(10, 10))
plt.pcolor(corr_whisky)
plt.axis('tight')
plt.colorbar()
plt.savefig('./python_case_studies/whisky/corr_whisky.pdf')

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)
model.rows_
np.sum(model.rows_, axis=1)
np.sum(model.rows_, axis=0)
model.row_labels_

whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
whisky = whisky.ix[np.argsort(model.row_labels_)]
whisky = whisky.reset_index(drop=True)
correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose())
correlations = np.array(correlations)
os.mkdir('solution')

#n_clusters = (3, 2)
n_clusters = 20

arq = open('dados_v2.txt')
dados = np.array([map(float, a.split('\t')[:-1]) for a in arq.readlines()])

plt.matshow(zip(*dados), cmap=cm.PiYG)
plt.title("Original dataset")
pl.savefig('solution/original.png', bbox_inches=0)

#model = SpectralBiclustering(n_clusters=n_clusters, method='log', random_state=0)
model = SpectralCoclustering(n_clusters=n_clusters, svd_method='arpack',
                             random_state=0)
model.fit(dados)

fit_data = dados[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(zip(*fit_data), cmap=cm.PiYG)
pl.savefig('solution/biclustered.png', bbox_inches=0)
plt.title("After biclustering; rearranged to show biclusters")

plt.matshow(zip(*np.outer(np.sort(model.row_labels_) + 1,
                          np.sort(model.column_labels_) + 1)),
            cmap=plt.cm.PiYG)
plt.title("Checkerboard structure of rearranged data")
pl.savefig('solution/biclustered and rearranged.png', bbox_inches=0)
def number_normalizer(tokens):
    """ Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant.  By applying
    this form of dimensionality reduction, some methods may perform better.
    """
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)


class NumberNormalizingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))


dir = ROOT_DIR + '\\processed_data\\'
data = pickle.load(open(dir + 'FT_raw_corpus_2013.p', 'rb'))

vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5)
cocluster = SpectralCoclustering(n_clusters=20, svd_method='arpack',
                                 random_state=0)
kmeans = MiniBatchKMeans(n_clusters=20, batch_size=20000, random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5,
    shuffle=False, random_state=0)

plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralCoclustering(n_clusters=5, random_state=0)
model.fit(data)
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print "consensus score: {:.3f}".format(score)

fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]

plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
cluster_colors = ["red", "orange", "green", "blue", "purple", "gray"] regions = [ "Speyside", "Highlands", "Lowlands", "Islands", "Campbelltown", "Islay" ] import numpy as np region_colors = dict(zip(regions, cluster_colors)) ## ENTER CODE HERE! ## print(region_colors) #from lectures import pandas as pd import pylab as plt whisky = pd.read_csv('whiskies.txt') whisky['Region'] = pd.read_csv('regions.txt') from sklearn.cluster.bicluster import SpectralCoclustering model = SpectralCoclustering(n_clusters=6, random_state=0) flavors = whisky.iloc[:, 2:14] corr_flavors = pd.DataFrame.corr(flavors) corr_whisky = pd.DataFrame.corr(flavors.transpose()) model.fit(corr_whisky) whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index) whisky = whisky.ix[np.argsort(model.row_labels_)] whisky = whisky.reset_index(drop=True) correlations = pd.DataFrame.corr(whisky.iloc[:, 2:14].transpose()) correlations = np.array(correlations) distilleries = list(whisky.Distillery) correlation_colors = [] for i in range(len(distilleries)): for j in range(len(distilleries)): if correlations[i][
print(len(user_movie_matrix))
print(len(user_movie_matrix[0]))
#print(user_movie_matrix)
print(type(user_movie_matrix))

# find number of users and movies in each bicluster
'''G = nx_graph_from_biadjacency_matrix(user_movie_matrix)
nx.draw(G)
plt.show()'''

# initialize and carry out clustering
K = 50
#km = KMeans(n_clusters=K)
#km.fit(user_movie_matrix)
scc = SpectralCoclustering(n_clusters=K, svd_method='arpack')
scc.fit(user_movie_matrix)

# labels
row_labels = scc.row_labels_
column_labels = scc.column_labels_

bicluster_num_users = np.zeros(K)
bicluster_num_movies = np.zeros(K)
# maintain a list of users per bicluster
bicluster_list_users = []
# maintain a list of movies per bicluster
bicluster_list_movies = []
for i in range(K):
    bicluster_list_users.append([])