from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import normalized_mutual_info_score as NMI


def evaluate_(label_true, pred_labels):
    """Score each candidate labelling against the ground truth with NMI and ARI."""
    nmi_ = {}
    ari_ = {}
    for i, label in enumerate(pred_labels):
        nmi_[f'best{i+1}'] = NMI(label_true, label)
        ari_[f'best{i+1}'] = ARI(label_true, label)
    return nmi_, ari_
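# A minimal usage sketch of evaluate_; the toy labels below are invented for
# illustration.
truth = [0, 0, 1, 1, 2, 2]
candidates = [
    [0, 0, 1, 1, 2, 2],  # perfect recovery
    [0, 0, 1, 1, 1, 1],  # two clusters merged
    [0, 1, 0, 1, 0, 1],  # unrelated labelling
]
nmi_scores, ari_scores = evaluate_(truth, candidates)
print(nmi_scores['best1'], ari_scores['best1'])  # 1.0 1.0 for the exact match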
import time
from multiprocessing import Pool

import hdbscan
import numpy as np
import umap
from sklearn.metrics import adjusted_rand_score as ARI

# read_chrlength, read_cell, impute_cpu, pca_reduce, dicide_optimised_pcs,
# read_label and the run-time settings (chrLengthPath, contactMatrixPath, res,
# pad, rp, ncpus, prct, benchmark) are defined elsewhere in this repository.


def main():
    chrLength = read_chrlength(chrLengthPath)
    chrTotalLength = sum(chrLength)
    ngene = int(chrTotalLength / res) + 1
    start_time = time.time()
    totalCell, cellRelativePath = read_cell(contactMatrixPath)
    paras = [[cell, ngene, pad, rp] for cell in cellRelativePath]
    # impute cells in chunks so at most ncpus workers run at a time
    result = []
    for i in range(totalCell // ncpus + 1):
        with Pool(ncpus) as p:
            result += p.map(impute_cpu, paras[i * ncpus:(i + 1) * ncpus])
    index = {x[0]: j for j, x in enumerate(result)}
    Q_concat = np.array([result[index[x]][1] for x in cellRelativePath])
    del result  # free memory
    # optional binarization to cut memory consumption: keep only the top
    # prct% of imputed contacts in each cell
    if prct > -1:
        thres = np.percentile(Q_concat, 100 - prct, axis=1)
        Q_concat = (Q_concat > thres[:, None])
    pcaMatrix, varianceRatio = pca_reduce(Q_concat)
    # save for potential later use
    np.save("pcaMatrix", pcaMatrix)
    np.savetxt("varianceRatio", varianceRatio)
    min_dim, max_dim = dicide_optimised_pcs(pcaMatrix)
    # fix random_state so the UMAP embedding is reproducible
    reducer_umap = umap.UMAP(random_state=42)
    embedding_cluster = reducer_umap.fit_transform(pcaMatrix[:, min_dim:max_dim])
    np.save("umapMatrix", embedding_cluster)
    # benchmark
    if benchmark:
        label = read_label(contactMatrixPath)
        ari = ARI(label, list(hdbscan.HDBSCAN().fit_predict(embedding_cluster)))
        with open('result.txt', 'w') as resFile:
            resFile.write("Benchmark mode, ARI is " + str(ari) + "\n")
            resFile.write("PCs selected are {} to {} \n".format(min_dim, max_dim))
            end_time = time.time()
            resFile.write('Load and impute all cells with ' +
                          str(end_time - start_time) + ' seconds')
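# Toy demonstration of the top-percentile binarization above: with prct = 25,
# only the strongest ~25% of imputed contacts in each cell (row) survive. The
# matrix is made up for illustration.
Q_demo = np.array([[0.1, 0.9, 0.4, 0.8],
                   [0.7, 0.2, 0.6, 0.3]])
thres_demo = np.percentile(Q_demo, 100 - 25, axis=1)
print(Q_demo > thres_demo[:, None])
# [[False  True False False]
#  [ True False False False]]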
def find_asso_nmi_ari(algo, pred_lab, real_lab):
    """Fit a co-clustering algorithm on the association matrix built from
    pred_lab, then score its row labels against the real labels."""
    association = total_association(pred_lab)
    algo.fit(association)
    # the fitted estimator exposes row_labels_ (e.g. sklearn co-clustering)
    return [NMI(real_lab, algo.row_labels_), ARI(real_lab, algo.row_labels_)]
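# A hedged usage sketch: sklearn's SpectralCoclustering exposes row_labels_,
# so it fits the `algo` interface above. The association matrix below is a
# stand-in guess at what total_association computes (the real helper lives
# elsewhere in this repo): for each pair of points, the number of labelings
# that place them in the same cluster.
import numpy as np
from sklearn.cluster import SpectralCoclustering

pred_lab_demo = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [0, 1, 1, 1]])
association_demo = sum((l[:, None] == l[None, :]).astype(float)
                       for l in pred_lab_demo)
algo_demo = SpectralCoclustering(n_clusters=2, random_state=0)
algo_demo.fit(association_demo)
real_lab_demo = [0, 0, 1, 1]
print(NMI(real_lab_demo, algo_demo.row_labels_),
      ARI(real_lab_demo, algo_demo.row_labels_))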
        else:
            r = delta
        delta = calculate_delta(l, r)
        w = get_w(a[n], delta)
        s1 = get_s(w)
        if er < 1e-9:
            print('#iter', iter1)
            break
    w_per[n] = w
    print('w:', w_per[n])
    print('ARI_sparse:', ARI(cluster_assigned_per[n], load[:, -1]))
    # matrix[f, 0] = ARI(cluster_assigned, load[:, -1])
    kmeans = KMeans(n_clusters=k, init='random', max_iter=100, n_init=1).fit(data)
    print('ARI_Kmeans:', ARI(kmeans.labels_, load[:, -1]))
    # matrix[f, 1] = ARI(kmeans.labels_, load[:, -1])
    print()

# gap-style score: compare one run's log objective against the reference mean
os = np.sum(a * w_per, axis=1)  # note: this name shadows the `os` module
obs_mean = np.log10(os)[1:16].mean()
gap_s = np.log10(os)[0] - obs_mean
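# One plausible reading of the last three lines above: gap_s contrasts the
# first run's log10 objective with the mean over runs 1..15, in the spirit of
# a gap-statistic comparison (which index is "observed" versus "reference"
# depends on code not shown here). A made-up numeric check:
os_vals = np.array([120.0, 80.0, 95.0, 88.0, 102.0])
gap_demo = np.log10(os_vals[0]) - np.log10(os_vals[1:]).mean()
print(gap_demo)  # positive when the first value beats the geometric mean of the rest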
    for i in range(k):
        b[0] = cs_new[i]
        # MAAD distance from every point to the current centre i
        MAAD_dist = np.zeros(data.shape[0])
        for j in range(data.shape[0]):
            b[1] = data[j]
            MAAD_dist[j] = MAAD(data, b) * (data.shape[0] - 2)
        distance[:, i] = MAAD_dist
    # assign each point to the centre with the smallest MAAD distance
    cluster_assigned = np.argmin(np.array(distance), axis=1)
    print(cluster_assigned)
    cs_old = np.array(cs_new)
    # update all k centres jointly; minimize expects a flat x0, so ravel the
    # centre matrix and reshape the solution back to (k, n_features)
    res = minimize(fun=function, x0=cs_old.ravel(), args=(load, cluster_assigned))
    cs_new = np.reshape(res.x, (k, data.shape[1]))
    er = np.linalg.norm(cs_new - cs_old)
    if er < 1e-9:
        print('#iter', iter1)
        break

print(cluster_assigned)
print('ARI_MAAD:', ARI(cluster_assigned, load[:, -1]))
kmeans = KMeans(n_clusters=k, init='random', max_iter=100, n_init=1).fit(data)
print('ARI_Kmeans:', ARI(kmeans.labels_, load[:, -1]))
print()
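# For context, a self-contained toy version of the same assign-then-optimize
# loop: plain Euclidean distance stands in for MAAD and a least-squares
# objective stands in for `function` (both are stand-ins; the real helpers
# live elsewhere in this repo).
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)
toy_data = rng.normal(size=(30, 2))
toy_k = 2
centres = toy_data[rng.choice(len(toy_data), toy_k, replace=False)]

def toy_objective(flat_cs, X, assign):
    cs = flat_cs.reshape(toy_k, X.shape[1])
    return sum(np.sum((X[assign == i] - cs[i]) ** 2) for i in range(toy_k))

for _ in range(50):
    # assign each point to its nearest current centre
    dist = np.linalg.norm(toy_data[:, None, :] - centres[None, :, :], axis=2)
    assign = dist.argmin(axis=1)
    old = centres.copy()
    # re-fit all centres jointly, as the MAAD loop does with scipy minimize
    centres = minimize(toy_objective, old.ravel(),
                       args=(toy_data, assign)).x.reshape(toy_k, 2)
    if np.linalg.norm(centres - old) < 1e-9:
        break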
import pandas as pd


def main():
    chrLength = read_chrlength(chrLengthPath)
    chrTotalLength = sum(chrLength)
    ngene = int(chrTotalLength / res) + 1
    start_time = time.time()
    totalCell, cellRelativePath = read_cell(contactMatrixPath)
    paras = [[cell, ngene, pad, rp] for cell in cellRelativePath]
    result = []
    for i in range(totalCell // ncpus + 1):
        with Pool(ncpus) as p:
            result += p.map(impute_cpu, paras[i * ncpus:(i + 1) * ncpus])
    index = {x[0]: j for j, x in enumerate(result)}
    Q_concat = np.array([result[index[x]][1] for x in cellRelativePath])
    del result  # free memory
    pcaMatrix, varianceRatio = pca_reduce(Q_concat)
    # save both binary (.npy) and plain-text copies for potential later use
    np.save("pcaMatrix", pcaMatrix)
    np.savetxt("pcaMatrix", pcaMatrix)
    np.savetxt("varianceRatio", varianceRatio)
    min_dim, max_dim = dicide_optimised_pcs(pcaMatrix)
    # fix random_state so the UMAP embedding is reproducible
    reducer_umap = umap.UMAP(random_state=42)
    embedding_cluster = reducer_umap.fit_transform(pcaMatrix[:, min_dim:max_dim])
    np.save("umapMatrix", embedding_cluster)
    np.savetxt("umapMatrix", embedding_cluster)
    # save clustering results to tsv
    cellnames = []
    for i in sorted(list(listdir_nohidden("./contactMatrix/"))):
        # str.strip('.conmat') would remove any of those characters from both
        # ends of the name, so cut the suffix explicitly instead
        cellnames.append(i[:-len('.conmat')] if i.endswith('.conmat') else i)
    clusteringRes = pd.DataFrame(
        np.array([
            cellnames,
            list(hdbscan.HDBSCAN().fit_predict(embedding_cluster))
        ])).T
    clusteringRes.columns = ['pairsFile', 'cluster']
    clusteringRes.to_csv("clusterRes.tsv", sep='\t')
    # benchmark
    if benchmark:
        label = read_label(contactMatrixPath)
        ari = ARI(label, list(hdbscan.HDBSCAN().fit_predict(embedding_cluster)))
        with open('result.txt', 'w') as resFile:
            resFile.write("Benchmark mode, ARI is " + str(ari) + "\n")
            resFile.write("PCs selected are {} to {} \n".format(min_dim, max_dim))
            end_time = time.time()
            resFile.write('Load and impute all cells with ' +
                          str(end_time - start_time) + ' seconds')
    else:
        with open('result.txt', 'w') as resFile:
            resFile.write("PCs selected are {} to {} \n".format(min_dim, max_dim))
            end_time = time.time()
            resFile.write('Load and impute all cells with ' +
                          str(end_time - start_time) + ' seconds')
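# Reading the saved assignments back (HDBSCAN labels noise points as -1, so
# downstream consumers should treat cluster -1 as "unassigned"):
clusters = pd.read_csv("clusterRes.tsv", sep='\t', index_col=0)
print(clusters[clusters['cluster'] != -1].groupby('cluster').size())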
import time

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score as ARI

# hicluster_gpu, hicluster_cpu, network, chromsize, chrom, label and nc come
# from the scHiCluster benchmark setup defined elsewhere.

# CpG content for each bin
cg = np.loadtxt('hg19/bin/hg19.1mb.bin.CpG.txt',
                dtype=str,  # np.str was removed in NumPy 1.24; use plain str
                skiprows=1,
                usecols=(0, 9, 11, 12))
cgdata = cg[:, 1:].astype(float)
# CpG density: CpG count divided by the span of the bin
cgdata = cgdata[:, 2] / (cgdata[:, 1] - cgdata[:, 0])
cgdata[np.isnan(cgdata)] = 0.0
chrcg = {c: cgdata[cg[:, 0] == 'chr' + c] for c in chrom}

# scHiCluster GPU
start_time = time.time()
cluster, embedding = hicluster_gpu(network, chromsize, nc=nc)
print(time.time() - start_time)
print([ARI(label, KMeans(n_clusters=nc, n_init=200).fit(embedding[:, :ndim]).labels_)
       for ndim in [2, 5, 10, 20, 50]])

# scHiCluster CPU
start_time = time.time()
cluster, embedding = hicluster_cpu(network, chromsize, nc=nc, ncpus=5)
print(time.time() - start_time)
print([ARI(label, KMeans(n_clusters=nc, n_init=200).fit(embedding[:, :ndim]).labels_)
       for ndim in [2, 5, 10, 20, 50]])

# PCA
start_time = time.time()
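# Note on the ARI sweeps above: each print() fits KMeans on the first ndim
# embedding dimensions (ndim in {2, 5, 10, 20, 50}) and scores the labels
# against the ground truth with ARI, showing how much cluster signal the extra
# dimensions add. n_init=200 restarts KMeans from 200 initializations and
# keeps the best, trading run time for stability of the benchmark numbers.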