Example #1
from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import normalized_mutual_info_score as NMI


def evaluate_(label_true, pred_labels):
    """Score each candidate labelling against the ground truth."""
    nmi_ = {}
    ari_ = {}

    # One NMI/ARI score per candidate labelling, keyed best1, best2, ...
    for i, label in enumerate(pred_labels):
        nmi_[f'best{i+1}'] = NMI(label_true, label)
        ari_[f'best{i+1}'] = ARI(label_true, label)
    return nmi_, ari_
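
A minimal usage sketch with toy labels (the inputs here are illustrative only):

label_true = [0, 0, 1, 1]
pred_labels = [[0, 0, 1, 1], [0, 1, 1, 1]]  # two candidate clusterings
nmi_, ari_ = evaluate_(label_true, pred_labels)
print(nmi_['best1'], ari_['best1'])  # 1.0 1.0 for the perfect labelling
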
Example #2
import time
from multiprocessing import Pool

import hdbscan
import numpy as np
import umap
from sklearn.metrics import adjusted_rand_score as ARI

# Configuration (chrLengthPath, contactMatrixPath, res, pad, rp, ncpus, prct,
# benchmark) and helpers (read_chrlength, read_cell, impute_cpu, pca_reduce,
# dicide_optimised_pcs, read_label) are assumed to be defined at module level.


def main():
    chrLength = read_chrlength(chrLengthPath)
    chrTotalLength = sum(chrLength)
    # Number of genome-wide bins at the chosen resolution
    ngene = int(chrTotalLength / res) + 1

    start_time = time.time()

    totalCell, cellRelativePath = read_cell(contactMatrixPath)

    paras = [[cell, ngene, pad, rp] for cell in cellRelativePath]

    result = []

    # Impute all cells in parallel; Pool.map already splits the work
    # across ncpus workers, so one pool covers every cell
    with Pool(ncpus) as p:
        result = p.map(impute_cpu, paras)

    # Reorder the imputed matrices to match the original cell order
    index = {x[0]: j for j, x in enumerate(result)}
    Q_concat = np.array([result[index[x]][1] for x in cellRelativePath])
    del result  # free memory

    # Optional binarisation: keep only the top prct% of imputed contacts per
    # cell to cut memory use (prct = -1 disables this step)
    if prct > -1:
        thres = np.percentile(Q_concat, 100 - prct, axis=1)
        Q_concat = (Q_concat > thres[:, None])

    pcaMatrix, varianceRatio = pca_reduce(Q_concat)

    # save for potential later use
    np.save("pcaMatrix", pcaMatrix)
    np.savetxt("varianceRatio", varianceRatio)

    min_dim, max_dim = dicide_optimised_pcs(pcaMatrix)

    # Fix random_state so the UMAP embedding is reproducible
    reducer_umap = umap.UMAP(random_state=42)
    embedding_cluster = reducer_umap.fit_transform(pcaMatrix[:, min_dim:max_dim])

    np.save("umapMatrix", embedding_cluster)

    # Benchmark mode: score the HDBSCAN clustering against known labels
    if benchmark:
        label = read_label(contactMatrixPath)
        ari = ARI(label,
                  list(hdbscan.HDBSCAN().fit_predict(embedding_cluster)))

        with open('result.txt', 'w') as resFile:
            resFile.write("Benchmark mode, ARI is {}\n".format(ari))
            resFile.write("PCs selected are {} to {}\n".format(min_dim, max_dim))
            end_time = time.time()
            resFile.write('Loaded and imputed all cells in {} seconds'.format(
                end_time - start_time))
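
The prct step above binarises each cell's imputed contacts at a per-cell percentile. A self-contained sketch of just that step (toy data; the prct value is illustrative):

import numpy as np

rng = np.random.default_rng(0)
Q_concat = rng.random((3, 10))  # 3 cells x 10 imputed contact features
prct = 20                       # keep the top 20% of entries per cell

thres = np.percentile(Q_concat, 100 - prct, axis=1)  # one threshold per cell
Q_bin = Q_concat > thres[:, None]                    # boolean, saves memory
print(Q_bin.sum(axis=1))  # roughly 2 entries survive in each row
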
Example #3
def find_asso_nmi_ari(algo, pred_lab, reel_lab):
    """Cluster the association matrix of pred_lab and score the resulting
    row labels against the true labels with NMI and ARI."""
    association = total_association(pred_lab)
    algo.fit(association)
    asso_result = [
        NMI(reel_lab, algo.row_labels_),
        ARI(reel_lab, algo.row_labels_),
    ]

    return asso_result
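
A minimal usage sketch; SpectralCoclustering is one scikit-learn estimator that exposes row_labels_ after fit(), and the matrix here is a stand-in since total_association is defined elsewhere:

import numpy as np
from sklearn.cluster import SpectralCoclustering

algo = SpectralCoclustering(n_clusters=2, random_state=0)
algo.fit(np.random.rand(6, 6) + 0.01)  # stand-in association matrix
print(algo.row_labels_)                # per-row cluster assignment, as used above
# With real data: nmi_score, ari_score = find_asso_nmi_ari(algo, pred_lab, reel_lab)
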
Example #4
                    else:
                        r = delta

                    # Update delta from the current [l, r] interval and
                    # refresh the weights (bisection-style search)
                    delta = calculate_delta(l, r)
                    w = get_w(a[n], delta)
                    s1 = get_s(w)

                if er < 1e-9:
                    print('#iter', iter1)
                    break

            w_per[n] = w
            print('w:', w_per[n])

            print('ARI_sparse:',
                  ARI(cluster_assigned_per[n], load[:, -1]))
            #matrix[f,0] = ARI(cluster_assigned, load[:,load.shape[1]-1])

            kmeans = KMeans(n_clusters=k,
                            init='random',
                            max_iter=100,
                            n_init=1).fit(data)

            print('ARI_Kmeans:', ARI(kmeans.labels_, load[:, -1]))
            #matrix[f,1] = ARI(kmeans.labels_, load[:,load.shape[1]-1])
            print()

        obj_sum = np.sum(a * w_per, axis=1)  # renamed from `os` to avoid shadowing the stdlib module
        obs_mean = np.log10(obj_sum)[1:16].mean()
        gap_s = np.log10(obj_sum)[0] - obs_mean
Example #5
    # Distance from every sample to each of the k current centres
    for i in range(k):
        b[0] = cs_new[i]
        #print('b[0]:',b[0])
        MAAD_dist = np.zeros(data.shape[0])
        for j in range(data.shape[0]):
            b[1] = data[j]
            MAAD_dist[j] = MAAD(data, b) * (data.shape[0] - 2)
            #print(MAAD_dist[j])
        distance[:, i] = MAAD_dist
        #print('MAAD_dist:',MAAD_dist)

    # Assign each sample to its nearest centre
    cluster_assigned = np.argmin(distance, axis=1)
    print(cluster_assigned)
    cs_old = np.array(cs_new)
    # Refit the centres given the current assignment; scipy's minimize
    # expects a flat x0, so ravel the centre matrix and reshape the result
    res = minimize(fun=function, x0=cs_old.ravel(), args=(load, cluster_assigned))
    cs_new = np.reshape(res.x, (k, data.shape[1]))
    er = np.linalg.norm(cs_new - cs_old)
    #print('er:',er)

    # Stop once the centres have converged
    if er < 1e-9:
        print('#iter', iter1)
        break

print(cluster_assigned)

print('ARI_MAAD:', ARI(cluster_assigned, load[:, -1]))
kmeans = KMeans(n_clusters=k, init='random', max_iter=100, n_init=1).fit(data)
print('ARI_Kmeans:', ARI(kmeans.labels_, load[:, -1]))
print()
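
The assignment step above in isolation: each row of the distance matrix holds one sample's distance to every centre, and argmin along axis=1 picks the closest one (toy numbers):

import numpy as np

distance = np.array([[0.2, 1.5],   # sample 0 is closest to centre 0
                     [2.0, 0.3]])  # sample 1 is closest to centre 1
print(np.argmin(distance, axis=1))  # -> [0 1]
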
Example #6
import time
from multiprocessing import Pool

import hdbscan
import numpy as np
import pandas as pd
import umap
from sklearn.metrics import adjusted_rand_score as ARI

# As in Example #2, configuration (chrLengthPath, contactMatrixPath, res, pad,
# rp, ncpus, benchmark) and the helper functions are assumed to be defined at
# module level.


def main():
    chrLength = read_chrlength(chrLengthPath)
    chrTotalLength = sum(chrLength)
    # Number of genome-wide bins at the chosen resolution
    ngene = int(chrTotalLength / res) + 1

    start_time = time.time()

    totalCell, cellRelativePath = read_cell(contactMatrixPath)

    paras = [[cell, ngene, pad, rp] for cell in cellRelativePath]

    result = []

    # Impute all cells in parallel; Pool.map already splits the work
    # across ncpus workers, so one pool covers every cell
    with Pool(ncpus) as p:
        result = p.map(impute_cpu, paras)

    # Reorder the imputed matrices to match the original cell order
    index = {x[0]: j for j, x in enumerate(result)}
    Q_concat = np.array([result[index[x]][1] for x in cellRelativePath])
    del result  # free memory

    pcaMatrix, varianceRatio = pca_reduce(Q_concat)

    # save both binary (.npy) and text copies for potential later use
    np.save("pcaMatrix", pcaMatrix)
    np.savetxt("pcaMatrix", pcaMatrix)
    np.savetxt("varianceRatio", varianceRatio)

    min_dim, max_dim = dicide_optimised_pcs(pcaMatrix)

    # Fix random_state so the UMAP embedding is reproducible
    reducer_umap = umap.UMAP(random_state=42)
    embedding_cluster = reducer_umap.fit_transform(pcaMatrix[:, min_dim:max_dim])

    np.save("umapMatrix", embedding_cluster)
    np.savetxt("umapMatrix", embedding_cluster)

    # Save clustering results to tsv
    cellnames = []
    for i in sorted(list(listdir_nohidden("./contactMatrix/"))):
        # str.strip removes a character set, not a suffix, so trim explicitly
        if i.endswith('.conmat'):
            i = i[:-len('.conmat')]
        cellnames.append(i)
    cluster_labels = hdbscan.HDBSCAN().fit_predict(embedding_cluster)
    clusteringRes = pd.DataFrame({
        'pairsFile': cellnames,
        'cluster': cluster_labels,
    })
    clusteringRes.to_csv("clusterRes.tsv", sep='\t')
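
A note on the filename handling above: str.strip removes a set of characters from both ends rather than a suffix, which is why the explicit endswith check is needed:

name = 'cell1.conmat'
print(name.strip('.conmat'))   # 'ell1'  -- strips characters, not the suffix
print(name[:-len('.conmat')])  # 'cell1' -- removes the suffix
# On Python 3.9+, name.removesuffix('.conmat') is the cleanest option.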

    # Benchmark mode: score the HDBSCAN clustering against known labels
    if benchmark:
        label = read_label(contactMatrixPath)
        ari = ARI(label, list(cluster_labels))

        with open('result.txt', 'w') as resFile:
            resFile.write("Benchmark mode, ARI is {}\n".format(ari))
            resFile.write("PCs selected are {} to {}\n".format(min_dim, max_dim))
            end_time = time.time()
            resFile.write('Loaded and imputed all cells in {} seconds'.format(
                end_time - start_time))
    else:
        with open('result.txt', 'w') as resFile:
            resFile.write("PCs selected are {} to {}\n".format(min_dim, max_dim))
            end_time = time.time()
            resFile.write('Loaded and imputed all cells in {} seconds'.format(
                end_time - start_time))
Example #7
# (chrom, network, chromsize, label and nc are assumed to be defined earlier)
# CpG content for each bin
cg = np.loadtxt('hg19/bin/hg19.1mb.bin.CpG.txt',
                dtype=str,  # np.str was removed in recent numpy versions
                skiprows=1,
                usecols=(0, 9, 11, 12))
cgdata = cg[:, 1:].astype(float)
# CpG density: count divided by the covered length of the bin
cgdata = cgdata[:, 2] / (cgdata[:, 1] - cgdata[:, 0])
cgdata[np.isnan(cgdata)] = 0.0
# Per-chromosome CpG density vectors
chrcg = {c: cgdata[cg[:, 0] == 'chr' + c] for c in chrom}
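
A toy illustration of the density arithmetic above (made-up values; the real columns come from the hg19 bin annotation file):

import numpy as np

# Columns: interval start, interval end, CpG count (toy values)
bins = np.array([[0.0, 1e6, 20000.0],
                 [1e6, 1e6, 0.0]])  # zero-length bin -> 0/0
with np.errstate(invalid='ignore'):
    density = bins[:, 2] / (bins[:, 1] - bins[:, 0])
density[np.isnan(density)] = 0.0  # same NaN handling as above
print(density)  # [0.02 0.  ]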

# scHiCluster GPU
start_time = time.time()
cluster, embedding = hicluster_gpu(network, chromsize, nc=nc)
print(time.time() - start_time)
# ARI of KMeans on the first ndim embedding dimensions, for several ndim
print([
    ARI(label,
        KMeans(n_clusters=nc, n_init=200).fit(embedding[:, :ndim]).labels_)
    for ndim in [2, 5, 10, 20, 50]
])

# scHiCluster CPU
start_time = time.time()
cluster, embedding = hicluster_cpu(network, chromsize, nc=nc, ncpus=5)
print(time.time() - start_time)
# Same sweep for the CPU embedding
print([
    ARI(label,
        KMeans(n_clusters=nc, n_init=200).fit(embedding[:, :ndim]).labels_)
    for ndim in [2, 5, 10, 20, 50]
])
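
The GPU and CPU sweeps repeat the same pattern; a hypothetical helper (not part of the original code) that wraps it and returns the best-scoring dimensionality:

def ari_sweep(label, embedding, nc, dims=(2, 5, 10, 20, 50)):
    """Return (ndim, ARI) for the dimensionality with the highest ARI."""
    scores = {
        ndim: ARI(label,
                  KMeans(n_clusters=nc, n_init=200)
                  .fit(embedding[:, :ndim]).labels_)
        for ndim in dims
    }
    return max(scores.items(), key=lambda kv: kv[1])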

# PCA
start_time = time.time()