def main():
    """Fit a Cheng-Church biclustering on ``pretestData.csv`` and report coverage.

    Reads ``pretestData.csv`` from the current working directory (first row
    used as header, first column as index), converts the column labels to
    ``int``, multiplies the values by 100 and fits a ``ChengChurch`` model
    with four clusters.  For every bicluster the indices and the shape are
    printed; finally the number of distinct entries seen in the first and
    second index arrays is printed, labelled "user" and "movie" respectively.

    Every ``print`` call is given exactly one pre-formatted argument, so the
    emitted text is the same whether ``print`` is the Python 2 statement or
    the print function.
    """
    src_path = os.path.join(os.getcwd(), 'pretestData.csv')
    pd_data = pd.read_csv(src_path, header=0, index_col=0)
    pd_data.columns = [int(label) for label in pd_data.columns.values]

    data = pd_data.values * 100
    print('%s %s' % (data.shape, type(data)))

    k = 4
    model = ChengChurch(n_clusters=k, max_msr=100, deletion_threshold=1.1,
                        inverse_rows=True, random_state=0)
    model.fit(data)

    # Sets de-duplicate while accumulating, which avoids re-copying a growing
    # array on every iteration.
    clustered_users = set()
    clustered_movies = set()
    for i in range(k):
        # Look the indices up once per bicluster and reuse the result.
        indices = model.get_indices(i)
        print(indices)
        print(model.get_shape(i))
        # NOTE(review): element 0 is presumably the row (user) indices and
        # element 1 the column (movie) indices, as in scikit-learn's
        # BiclusterMixin -- confirm against the ChengChurch implementation.
        # ravel() flattens the same way np.append did.
        clustered_users.update(np.ravel(indices[0]).tolist())
        clustered_movies.update(np.ravel(indices[1]).tolist())

    # Joining with a single space reproduces the separator that the Python 2
    # print statement inserts between comma-separated items.
    summary = ('user number of clustered = ', len(clustered_users),
               'movie number of clustered = ', len(clustered_movies))
    print(' '.join(str(part) for part in summary))
lines = list(list(int(i) for i in line if i) for line in lines) data = np.array(lines) pd_data = pd.DataFrame(data) pd_data.to_csv('lymphoma.csv') """ pd_data = pd.read_csv("lymphoma.csv", header=0, index_col=0) data = pd_data.values print type(data), data.shape # replace missing values, just as in the paper generator = np.random.RandomState(0) idx = np.where(data == 999) data[idx] = generator.randint(-800, 801, len(idx[0])) # cluster with same parameters as original paper model = ChengChurch(n_clusters=100, max_msr=1200, deletion_threshold=1.2, inverse_rows=True, random_state=0) model.fit(data) # find bicluster with smallest msr and plot it msr = lambda a: (np.power(a - a.mean(axis=1, keepdims=True) - a.mean(axis=0) + a.mean(), 2).mean()) msrs = list(msr(model.get_submatrix(i, data)) for i in range(100)) arr = model.get_submatrix(np.argmin(msrs), data) print type(arr), arr.shape df = DataFrame(arr) df["row"] = map(str, range(arr.shape[0])) parallel_coordinates(df, "row", linewidth=1.5) plt.xlabel("column") plt.ylabel("expression level") plt.gca().legend_ = None plt.show()