Example #1
    def evaluate(self, memberships):
        """Score predicted memberships against ground truth, per graph and pooled."""
        groundTruth = self.groundTruth
        n_graphs = self.n_graphs
        individual_nmi = np.zeros([n_graphs])
        individual_ari = np.zeros([n_graphs])
        individual_mcr = np.zeros([n_graphs])
        for n in range(n_graphs):
            individual_nmi[n] = nmi(memberships[n], groundTruth[n])
            individual_ari[n] = ari(memberships[n], groundTruth[n])
            individual_mcr[n] = mcr(memberships[n], groundTruth[n])

        # Pool all graphs and score the stacked label vectors.
        trueMemberships_stacked = np.hstack(groundTruth)
        memberships_stacked = np.hstack(memberships)
        overall_nmi = nmi(memberships_stacked, trueMemberships_stacked)
        overall_ari = ari(memberships_stacked, trueMemberships_stacked)
        overall_mcr = mcr(memberships_stacked, trueMemberships_stacked)

        return {
            "NMI": {
                'nmi': np.mean(individual_nmi),
                'overall_nmi': overall_nmi
            },
            "ARI": {
                'ari': np.mean(individual_ari),
                'overall_ari': overall_ari
            },
            "MCR": {
                'mcr': np.mean(individual_mcr),
                'overall_mcr': overall_mcr
            }
        }
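
A minimal sketch (my addition, not part of the snippet) contrasting the two aggregation modes above: per-graph scores are invariant to how each graph's clusters happen to be numbered, while the pooled score needs a numbering that is consistent across graphs, so the mean per-graph NMI can be 1.0 while the overall NMI is 0.0.

import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

memberships = [np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0])]
ground_truth = [np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1])]

# Each graph matches its truth up to a relabeling, so both per-graph NMIs are 1.
mean_nmi = np.mean([nmi(m, g) for m, g in zip(memberships, ground_truth)])
# The relabeling differs across graphs, so the stacked vectors are independent.
overall_nmi = nmi(np.hstack(memberships), np.hstack(ground_truth))
print(mean_nmi, overall_nmi)  # 1.0 0.0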
Example #2
File: main.py  Project: xuyaokui/scBKAP
def clust(data_path, label_path, pca_com, phate_com):
    input_path = data_path + ".csv"
    label_path = label_path + ".csv"
    # Read the expression matrix, drop the first row (likely a header),
    # and transpose to cells x genes.
    X = pd.read_csv(input_path, header=None)
    X = X.drop(0)
    X = np.array(X)
    X = X.transpose()

    # Reduce with PCA, then embed with PHATE.
    pca = PCA(n_components=pca_com)
    b = pca.fit_transform(X)
    phate_op = phate.PHATE(n_components=phate_com)
    data_phate = phate_op.fit_transform(b)

    label = pd.read_csv(label_path)
    label = np.array(label).ravel()
    c = label.max()  # assumes labels are 1..c integers, so max() is the cluster count
    # Bisecting k-means on the PHATE embedding; column 0 holds the cluster index.
    centList, clusterAssment = biKmeans(data_phate, c)
    julei = np.array(clusterAssment[:, 0]).ravel()  # predicted cluster per cell

    print('NMI value is %f \n' % nmi(julei, label))
    print('ARI value is %f \n' % ari(julei, label))
    # homogeneity_score expects (labels_true, labels_pred)
    print('HOM value is %f \n' % metrics.homogeneity_score(label, julei))
    print('AMI value is %f \n' %
          metrics.adjusted_mutual_info_score(label, julei))

    return julei
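
A hypothetical call, with placeholder paths and component counts (the function appends ".csv" to both paths itself):

labels = clust('yan/yan', 'yan/yan_label', pca_com=50, phate_com=2)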
Example #3
def training_and_testing():
    k = 40  # number of clusters
    X_train, X_test, y_train, y_test = data_collection_from_file()
    start = timeit.default_timer()
    print(np.shape(X_train))
    # WEFCM returns the fuzzy membership matrix U and the cluster centers C.
    U, C = WEFCM(X_train, k)
    # Assign each cluster a label from the training data, then read off the
    # predicted label of every training point.
    y1_train = label_cluster(X_train, y_train, C)
    pre_labels = getpredicted_labels(U, len(y_train))
    stop = timeit.default_timer()
    print('run time:= ', stop - start)
    print('NMI:= ', nmi(y_train, pre_labels))
    print('ARI:= ', ari(y_train, pre_labels))
    # Classify the held-out points using the cluster centers and their labels.
    y1_test = test_data(X_test, C, y1_train)
    accuracy = float(np.sum(y1_test == y_test)) / len(y_test)
    print('accuracy:= ', accuracy)
Example #4
File: demo.py  Project: xuyaokui/scBKAP
"""

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.cluster import adjusted_rand_score as ari
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

# Read the expression matrix and transpose to cells x genes.
X = pd.read_csv('yan/yan.csv', header=None)
X = np.array(X)
X = X.transpose()

label = pd.read_csv('yan/yan_label.csv')
label = np.array(label).ravel()

pca = PCA(n_components=2)
A = pca.fit_transform(X)

c = label.max()  # assumes labels are 1..c integers, so max() is the cluster count
kk = KMeans(n_clusters=c)
julei = kk.fit(A).labels_

print('NMI value is %f \n' % nmi(julei, label))
print('ARI value is %f \n' % ari(julei, label))
# homogeneity_score expects (labels_true, labels_pred)
print('HOM value is %f \n' % metrics.homogeneity_score(label, julei))
print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, julei))
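
Note that NMI, ARI, and AMI compare partitions rather than label values, so KMeans's 0-based labels can be scored directly against 1-based ground truth without remapping; a quick check (my addition):

print(ari([0, 0, 1, 1], [1, 1, 2, 2]))  # 1.0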
Example #5
# `sca` comes from an earlier state-estimation step; cell_subset appears to be
# a boolean mask over cells and w a clusters x cells weight matrix.
sca.cell_subset.shape
labels = sca.w.argmax(0)  # hard cluster assignment per cell

from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Seurat cluster assignments and restrict them to the same cell subset.
labels_b = pd.read_csv('80k_cluster_numbers.csv')
labels_b = labels_b.iloc[:, 1].values
labels_b_subset = labels_b[sca.cell_subset]
nmi(labels_b_subset, labels)

from sklearn.metrics.cluster import adjusted_rand_score as ari
ari(labels_b_subset, labels)

# Contingency counts: rows are Seurat clusters, columns are UNCURL clusters.
# This indexing assumes both label sets are contiguous 0-based integers.
cluster_counts = np.zeros((len(set(labels_b)), len(set(labels))))
for i, j in zip(labels_b_subset, labels):
    cluster_counts[i, j] += 1

plt.figure(figsize=(10, 25))
sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis],
            yticklabels=sorted(list(set(labels_b_subset))),
            vmin=0,
            vmax=1,
            linewidths=0.5)
plt.xlabel('UNCURL clusters')
plt.ylabel('Seurat clusters')
plt.title('SCH Cerebellum Clusters')
plt.savefig('uncurl_vs_seurat_clusters.png', dpi=200)
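
sklearn ships the same counting logic as the loop above; a sketch (my addition) using contingency_matrix, which also tolerates label values that are not contiguous 0-based integers:

from sklearn.metrics.cluster import contingency_matrix

counts = contingency_matrix(labels_b_subset, labels)  # rows: Seurat, columns: UNCURL
normalized = counts / counts.sum(1, keepdims=True)    # row-normalize, as in the heatmap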
Example #6
def run_experiment(methods,
                   data,
                   n_classes,
                   true_labels,
                   n_runs=10,
                   use_purity=True,
                   use_nmi=False,
                   use_ari=False,
                   use_nne=False,
                   consensus=False):
    """
    runs a pre-processing + clustering experiment...

    exactly one of use_purity, use_nmi, or use_ari can be true

    Args:
        methods: list of 2-tuples. The first element is either a single Preprocess object or a list of Preprocess objects, to be applied in sequence to the data. The second element is either a single Cluster object, a list of Cluster objects, or a list of lists, where each list is a sequence of Preprocess objects with the final element being a Cluster object.
        data: genes x cells array
        true_labels: 1d array of length cells
        consensus: if true, runs a consensus on cluster results for each method at the very end.
        use_purity, use_nmi, use_ari, use_nne: which error metric to use (at most one can be True)

    Returns:
        purities (list of lists)
        names (list of lists)
        other (dict): keys: timing, preprocessing, clusterings
    """
    results = []
    names = []
    clusterings = {}
    other_results = {}
    other_results['timing'] = {}
    other_results['preprocessing'] = {}
    if use_purity:
        purity_method = purity
    elif use_nmi:
        purity_method = nmi
    elif use_ari:
        purity_method = ari
    elif use_nne:
        purity_method = nne
    else:
        raise ValueError('one of use_purity, use_nmi, use_ari, use_nne must be True')
    for i in range(n_runs):
        print('run {0}'.format(i))
        purities = []
        r = 0
        method_index = 0
        for preproc, cluster in methods:
            t0 = time.time()
            if isinstance(preproc, Preprocess):
                preprocessed, ll = preproc.run(data)
                output_names = preproc.output_names
            else:
                # if the input is a list, apply each preprocess in sequence,
                # keeping only the first output of each step
                p1 = data
                output_names = ['']
                for p in preproc:
                    p1, ll = p.run(p1)
                    p1 = p1[0]
                    if output_names[0] != '':
                        output_names[0] = output_names[0] + '_' + p.output_names[0]
                    else:
                        output_names[0] = p.output_names[0]
                preprocessed = [p1]
            t1 = time.time() - t0
            for name, pre in zip(output_names, preprocessed):
                starting_index = method_index
                if isinstance(cluster, Cluster):
                    t0 = time.time()
                    labels = cluster.run(pre)
                    t2 = t1 + time.time() - t0
                    if use_nne:
                        purities.append(purity_method(pre, true_labels))
                    else:
                        purities.append(purity_method(labels, true_labels))
                    if i == 0:
                        names.append(name + '_' + cluster.name)
                        clusterings[names[-1]] = []
                        other_results['timing'][names[-1]] = []
                    print(names[r])
                    clusterings[names[r]].append(labels)
                    print('time: ' + str(t2))
                    other_results['timing'][names[r]].append(t2)
                    print(purities[-1])
                    r += 1
                    method_index += 1
                elif type(cluster) == list:
                    for c in cluster:
                        if isinstance(c, list):
                            t2 = t1
                            name2 = name
                            sub_data = pre.copy()
                            for subproc in c[:-1]:
                                t0 = time.time()
                                subproc_out, ll = subproc.run(sub_data)
                                sub_data = subproc_out[0]
                                name2 = name2 + '_' + subproc.output_names[0]
                                t2 += time.time() - t0
                            t0 = time.time()
                            labels = c[-1].run(sub_data)
                            t2 += time.time() - t0
                            if use_nne:
                                purities.append(
                                    purity_method(sub_data, true_labels))
                            else:
                                purities.append(
                                    purity_method(labels, true_labels))
                            if i == 0:
                                names.append(name2 + '_' + c[-1].name)
                                clusterings[names[-1]] = []
                                other_results['timing'][names[-1]] = []
                            print(names[r])
                            clusterings[names[r]].append(labels)
                            other_results['timing'][names[r]].append(t2)
                            print('time: ' + str(t2))
                            print(purities[-1])
                            r += 1
                            method_index += 1
                        else:
                            try:
                                t0 = time.time()
                                labels = c.run(pre)
                                t2 = t1 + time.time() - t0
                                if i == 0:
                                    names.append(name + '_' + c.name)
                                    clusterings[names[-1]] = []
                                    other_results['timing'][names[-1]] = []
                                if use_nne:
                                    purities.append(
                                        purity_method(pre, true_labels))
                                else:
                                    purities.append(
                                        purity_method(labels, true_labels))
                                print(names[r])
                                clusterings[names[r]].append(labels)
                                other_results['timing'][names[r]].append(t2)
                                print('time: ' + str(t2))
                                print(purities[-1])
                                r += 1
                                method_index += 1
                            except Exception as e:
                                print('failed to do clustering:', e)
                # save the preprocessing output whose clusterings scored
                # highest under the chosen metric
                num_clustering_results = method_index - starting_index
                clustering_results = purities[-num_clustering_results:]
                if i > 0 and len(clustering_results) > 0:
                    old_clustering_results = results[-1][starting_index:method_index]
                    if max(old_clustering_results) < max(clustering_results):
                        other_results['preprocessing'][name] = pre
                else:
                    other_results['preprocessing'][name] = pre
        print('\t'.join(names))
        print('purities: ' + '\t'.join(map(str, purities)))
        results.append(purities)
    consensus_purities = []
    if consensus:
        other_results['consensus'] = {}
        k = len(np.unique(true_labels))
        for name, clusts in clusterings.items():
            print(name)
            clusts = np.vstack(clusts)
            consensus_clust = CE.cluster_ensembles(clusts,
                                                   verbose=False,
                                                   N_clusters_max=k)
            other_results['consensus'][name] = consensus_clust
            if use_purity:
                consensus_purity = purity(consensus_clust.flatten(),
                                          true_labels)
                print('consensus purity: ' + str(consensus_purity))
                consensus_purities.append(consensus_purity)
            if use_nmi:
                consensus_nmi = nmi(true_labels, consensus_clust)
                print('consensus NMI: ' + str(consensus_nmi))
                consensus_purities.append(consensus_nmi)
            if use_ari:
                consensus_ari = ari(true_labels, consensus_clust)
                print('consensus ARI: ' + str(consensus_ari))
                consensus_purities.append(consensus_ari)
        print('consensus results: ' + '\t'.join(map(str, consensus_purities)))
    other_results['clusterings'] = clusterings
    return results, names, other_results
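
A hedged usage sketch; LogPreprocess, PCAPreprocess, and KMCluster are hypothetical stand-ins for whatever Preprocess and Cluster subclasses the surrounding codebase defines:

methods = [
    (LogPreprocess(), KMCluster()),                       # single preprocess, single clusterer
    ([LogPreprocess(), PCAPreprocess()], [KMCluster()]),  # preprocess chain, list of clusterers
]
results, names, other = run_experiment(
    methods, data, n_classes=8, true_labels=true_labels,
    n_runs=5, use_purity=False, use_nmi=True)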
Example #7
import time

import numpy as np
import scipy.io as sio
import SIMLR
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from sklearn.metrics.cluster import adjusted_rand_score as ari
from scipy.sparse import csr_matrix

filename = 'Zeisel.mat'
mat = sio.loadmat(filename)
X = csr_matrix(mat['X'])  # single-cell RNA-seq count matrix
# Log-transform the gene counts; this matters because it makes the data
# closer to Gaussian.
X.data = np.log10(1 + X.data)
label = mat['true_labs']  # ground-truth labels for validation
c = label.max()  # number of clusters
### If there are more than 500 genes, we recommend running PCA first.
print('Starting to run PCA on the RNA-seq data!\n')
start_main = time.time()
if X.shape[1] > 500:
    X = SIMLR.helper.fast_pca(X, 500)
else:
    X = X.todense()
print('Successfully ran PCA! PCA took %f seconds in total\n' %
      (time.time() - start_main))
print('Starting to run SIMLR!\n')
start_main = time.time()
simlr = SIMLR.SIMLR_LARGE(c, 30, 0)
### SIMLR_LARGE takes the number of clusters (rank), the number of neighbors,
### and a binary flag for memory-saving mode; turn the flag on when the number
### of cells is extremely large, at some cost in efficiency.
S, F, val, ind = simlr.fit(X)
print('Successfully ran SIMLR! SIMLR took %f seconds in total\n' %
      (time.time() - start_main))
y_pred = simlr.fast_minibatch_kmeans(F, c)
print('NMI value is %f \n' % nmi(y_pred.flatten(), label.flatten()))
print('ARI value is %f \n' % ari(y_pred.flatten(), label.flatten()))