Example #1
    def evaluate(self, memberships):
        groundTruth = self.groundTruth
        n_graphs = self.n_graphs
        individual_nmi = np.zeros([n_graphs])
        individual_ari = np.zeros([n_graphs])
        individual_mcr = np.zeros([n_graphs])
        for n in range(n_graphs):
            # print(n)
            individual_nmi[n] = nmi(memberships[n], groundTruth[n])
            individual_ari[n] = ari(memberships[n], groundTruth[n])
            individual_mcr[n] = mcr(memberships[n], groundTruth[n])

        trueMemberships_stacked = np.reshape(np.hstack(groundTruth), [-1])
        memberships_stacked = np.hstack(memberships)
        overall_nmi = nmi(memberships_stacked, trueMemberships_stacked)
        overall_ari = ari(memberships_stacked, trueMemberships_stacked)
        overall_mcr = mcr(memberships_stacked, trueMemberships_stacked)

        return {
            "NMI": {
                'nmi': np.mean(individual_nmi),
                'overall_nmi': overall_nmi
            },
            "ARI": {
                'ari': np.mean(individual_ari),
                'overall_ari': overall_ari
            },
            "MCR": {
                'mcr': np.mean(individual_mcr),
                'overall_mcr': overall_mcr
            }
        }
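
The evaluate method above relies on class attributes (groundTruth, n_graphs) and an mcr helper that are not shown. A minimal, self-contained sketch of the same per-graph vs. pooled scoring pattern, using only sklearn's NMI/ARI and toy labels:

import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from sklearn.metrics.cluster import adjusted_rand_score as ari

# toy ground truth and predicted memberships for two graphs (shapes assumed)
ground_truth = [np.array([0, 0, 1, 1]), np.array([0, 1, 1, 0])]
memberships = [np.array([1, 1, 0, 0]), np.array([0, 1, 1, 1])]

per_graph_nmi = [nmi(t, m) for m, t in zip(memberships, ground_truth)]
per_graph_ari = [ari(t, m) for m, t in zip(memberships, ground_truth)]
pooled_nmi = nmi(np.hstack(ground_truth), np.hstack(memberships))
print('mean per-graph NMI:', np.mean(per_graph_nmi))
print('pooled NMI:', pooled_nmi)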
Example #2
 def test_leiden(self):
     m, w, ll = uncurl.run_state_estimation(self.data_subset,
                                            8,
                                            max_iters=20,
                                            inner_max_iters=50)
     print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
     g = clustering_methods.create_graph(w.T, metric='cosine')
     leiden_clustering = clustering_methods.run_leiden(g)
     self.assertTrue(nmi(self.labels, leiden_clustering) >= 0.7)
     louvain_clustering = clustering_methods.run_louvain(g)
     self.assertTrue(nmi(self.labels, louvain_clustering) >= 0.7)
Example #3
def test_clustering(n_runs=20, alpha=0.5):
    nmis_both = []
    nmis_attributes = []
    nmis_structure = []
    for i in range(n_runs):
        print("Run number {0}".format(i))
        ensemble_density_huge('file.csv', "'\t'")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t",
                                 header=None).values
        dist_dense = dist_dense[:, :-1]

        sims_attributes = ensemble_attributes("file_attributes.csv", "\t")
        sim_attributes = pd.read_csv("./matrix_uet.csv",
                                     delimiter="\t",
                                     header=None).values
        sim_attributes = sim_attributes[:, :-1]

        dist_attributes = sim_to_dist(np.array(sim_attributes))
        dist = alpha * dist_dense + (1 - alpha) * dist_attributes
        dist = dist / 2
        model_kmeans = KMeans(n_clusters=len(set(true)))
        scaler = QuantileTransformer(n_quantiles=10)
        dist_scaled = scaler.fit_transform(dist)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        dist_attributes_scaled = scaler.fit_transform(dist_attributes)
        results_dense = TSNE(
            metric="precomputed").fit_transform(dist_dense_scaled)

        results_dense_both = TSNE(
            metric="precomputed").fit_transform(dist_scaled)
        results_dense_attributes = TSNE(
            metric="precomputed").fit_transform(dist_attributes_scaled)
        labels_dense_kmeans_both = model_kmeans.fit_predict(results_dense_both)
        labels_dense_kmeans_attributes = model_kmeans.fit_predict(
            results_dense_attributes)
        labels_dense_kmeans_structure = model_kmeans.fit_predict(results_dense)

        nmis_both.append(
            nmi(labels_dense_kmeans_both, true, average_method="arithmetic"))
        nmis_attributes.append(
            nmi(labels_dense_kmeans_attributes,
                true,
                average_method="arithmetic"))
        nmis_structure.append(
            nmi(labels_dense_kmeans_structure,
                true,
                average_method="arithmetic"))
    print("Structure : {0}, {1}".format(np.mean(nmis_structure),
                                        np.std(nmis_structure)))
    print("Attributes : {0}, {1}".format(np.mean(nmis_attributes),
                                         np.std(nmis_attributes)))
    print("Both : {0}, {1}".format(np.mean(nmis_both), np.std(nmis_both)))

    return (nmis_structure, nmis_attributes, nmis_both)
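
A hedged, self-contained sketch of the blend-then-embed-then-cluster pattern above, using random symmetric matrices as stand-ins for the project-specific dist_dense / dist_attributes (ensemble_density_huge, sim_to_dist, and true are not shown in the snippet):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

rng = np.random.default_rng(0)
n, alpha = 60, 0.5
true = np.repeat([0, 1, 2], n // 3)
# toy symmetric distance matrices standing in for dist_dense / dist_attributes
a = rng.random((n, n))
dist_dense = (a + a.T) / 2
np.fill_diagonal(dist_dense, 0)
b = rng.random((n, n))
dist_attributes = (b + b.T) / 2
np.fill_diagonal(dist_attributes, 0)

dist = (alpha * dist_dense + (1 - alpha) * dist_attributes) / 2
# init='random' is required by recent sklearn when metric='precomputed'
emb = TSNE(metric="precomputed", init="random").fit_transform(dist)
labels = KMeans(n_clusters=len(set(true)), n_init=10).fit_predict(emb)
print(nmi(labels, true, average_method="arithmetic"))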
Example #4
 def test_run_uncurl(self):
     sca = sc_analysis.SCAnalysis(self.data_dir,
                                  clusters=8,
                                  frac=0.2,
                                  data_filename='data.mtx',
                                  max_iters=20,
                                  inner_max_iters=50)
     sca.run_uncurl()
     self.assertTrue(sca.has_w)
     self.assertTrue(sca.has_m)
     self.assertTrue(sca.w.shape[0] == 8)
     self.assertTrue(sca.w.shape[1] == self.data.shape[1])
     self.assertTrue(os.path.exists(sca.w_f))
     self.assertTrue(os.path.exists(sca.m_f))
     print(nmi(sca.labels, self.labs))
     self.assertTrue(nmi(sca.labels, self.labs) > 0.65)
Example #5
 def test_add_color_track(self):
     sca = sc_analysis.SCAnalysis(self.data_dir,
                                  frac=0.2,
                                  clusters=8,
                                  data_filename='data.mtx',
                                  baseline_dim_red='tsvd',
                                  dim_red_option='MDS',
                                  clustering_method='leiden',
                                  cell_frac=1.0,
                                  max_iters=20,
                                  inner_max_iters=10)
     sca.add_color_track('true_labels', self.labs, is_discrete=True)
     true_labels, is_discrete = sca.get_color_track('true_labels')
     self.assertTrue(nmi(true_labels, self.labs) > 0.99)
     top_genes, top_pvals = sca.calculate_diffexp('true_labels')
     self.assertEqual(len(top_genes), 8)
     self.assertEqual(len(top_pvals), 8)
     sca.add_color_track('true_labels_2', self.labs, is_discrete=False)
     true_labels_2, _ = sca.get_color_track('true_labels_2')
     self.assertTrue((true_labels_2.astype(int) == self.labs).all())
     pairwise_genes, pairwise_pvals = sca.calculate_diffexp('true_labels',
                                                            mode='pairwise')
     self.assertEqual(pairwise_genes.shape, pairwise_pvals.shape)
     pairwise_genes, pairwise_pvals = sca.calculate_diffexp('true_labels',
                                                            mode='pairwise')
     self.assertEqual(pairwise_genes.shape, pairwise_pvals.shape)
     self.assertEqual(pairwise_genes.shape[0], 8)
     top_genes, top_pvals = sca.calculate_diffexp('true_labels')
     self.assertEqual(len(top_genes[0]), len(sca.gene_names))
     self.assertEqual(len(top_genes), 8)
     self.assertEqual(len(top_pvals), 8)
Example #6
def check_clusterpurity(f, cluster, dataset, gnd_label, sys_label):
    """
    Check cluster purity with respect to x-vectors belonging to single speakers.
    """
    clusterpurity = []
    clustervar = []
    fullclasslabel = []
    clean_ind = []
    for c in cluster:
        classlabel = []
        clustervar.append(np.var(c))
        for a in c:
            if len(gnd_label[a]) == 1:
                classlabel.append(gnd_label[a][0])
                clean_ind.append(a)
            # else:
            #     sys_label = np.delete(sys_label,a)
        fullclasslabel.extend(classlabel)
        classlabel = np.array(classlabel)

        if len(classlabel) == 0:
            clusterpurity.append(0)
            continue
        unilabel = mostFrequent(classlabel, len(classlabel))
        purity = (len(np.where(classlabel == unilabel)[0]) /
                  len(classlabel)) * 100

        clusterpurity.append(purity)
    sys_label = sys_label[clean_ind]
    Nmi_score = nmi(fullclasslabel, sys_label.tolist())
    print('NMI score for n_cluster:{} is {}'.format(len(cluster), Nmi_score))
    print('cluster purity for n_cluster:{} is {}'.format(
        len(cluster), clusterpurity))
Example #7
File: main.py  Project: xuyaokui/scBKAP
def clust(data_path, label_path, pca_com, phate_com):
    input_path = data_path + ".csv"
    label_path = label_path + ".csv"
    X = pd.read_csv(input_path, header=None)
    X = X.drop(0)
    X = np.array(X)
    X = X.transpose()

    pca = PCA(n_components=pca_com)
    b = pca.fit_transform(X)
    phate_op = phate.PHATE(n_components=phate_com)
    data_phate = phate_op.fit_transform(b)
    label = pd.read_csv(label_path)
    y = np.array(label)
    label = y.ravel()
    c = label.max()
    centList, clusterAssment = biKmeans(data_phate, c)
    julei = clusterAssment[:, 0]
    y = np.array(julei)
    julei = y.ravel()

    print('NMI value is %f \n' % nmi(julei.flatten(), label.flatten()))
    print('ARI value is %f \n' % ari(julei.flatten(), label.flatten()))
    print('HOM value is %f \n' % metrics.homogeneity_score(julei, label))
    print('AMI value is %f \n' %
          metrics.adjusted_mutual_info_score(label, julei))

    return julei
Example #8
    def eval_cluster_on_test(self):

        # Embedding points in the test data to the latent space
        inp_encoder = self.data_test
        latent_matrix = self.sess.run(self.z,
                                      feed_dict={
                                          self.x_input: inp_encoder,
                                          self.keep_prob: 1.0
                                      })

        labels = self.labels_test
        K = np.size(np.unique(labels))
        kmeans = KMeans(n_clusters=K, random_state=0).fit(latent_matrix)
        y_pred = kmeans.labels_

        print('Computing NMI ...')
        NMI = nmi(labels.flatten(), y_pred.flatten())
        print('Done !')

        print('NMI = {}'.format(NMI))

        if not os.path.exists('Res_DRA/tune_logs'):
            os.makedirs('Res_DRA/tune_logs')

        out_file_name = 'Res_DRA/tune_logs/Metrics_{}.txt'.format(
            self.dataset_name)
        f = open(out_file_name, 'a')

        f.write('\n{}, NMI = {}'.format(self.model_dir, NMI))

        f.close()
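
A minimal, self-contained version of the cluster-then-score step above, with a toy two-blob embedding standing in for the latent matrix produced by the encoder:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

rng = np.random.default_rng(0)
# toy latent embedding: two well-separated Gaussian blobs
latent_matrix = np.vstack([rng.normal(0.0, 0.1, (50, 2)),
                           rng.normal(3.0, 0.1, (50, 2))])
labels = np.repeat([0, 1], 50)

K = np.size(np.unique(labels))
y_pred = KMeans(n_clusters=K, random_state=0, n_init=10).fit(latent_matrix).labels_
print('NMI = {}'.format(nmi(labels.flatten(), y_pred.flatten())))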
Example #9
def training_and_testing():
    k = 40
    #X_train,y_train=data_collect_for_wefcm()
    X_train, X_test, y_train, y_test = data_collection_from_file()
    #print np.shape(X)
    #X_train,X_test,y_train,y_test=train_test_split(X,Y, test_size=0.2)
    start = timeit.default_timer()
    #U=ARWEFCM(X_train,k)
    print(np.shape(X_train))
    U, C = WEFCM(X_train, k)
    #X_test,y_test=data_collection_from_test_file()
    #this is ofcm
    #print "c start"
    #C=calculateClusterCenter(U,X_train,k)
    #my_df = pd.DataFrame(C)
    #my_df.to_csv('out.csv',index=False, header=False)
    #print C[0:2]
    #print "c end"
    y1_train = label_cluster(X_train, y_train, C)
    pre_labels = getpredicted_labels(U, len(y_train))
    stop = timeit.default_timer()
    print('run time:= ', stop - start)
    r1 = nmi(y_train, pre_labels)
    r2 = ari(y_train, pre_labels)
    print('NMI:= ', r1)
    print('ARI:= ', r2)
    #print y1_train
    #print len(X_train)
    y1_test = test_data(X_test, C, y1_train)
    #print C
    accuracy = (float(np.sum(y1_test == y_test))) / len(y_test)
    print('accuracy:= ', accuracy)
    #print(classification_report(y_test, y1_test, target_names=y_test))
    '''f = open("result1.ods","a+")
Example #10
def graph_learning_perf_eval(L_orig, L_pred):
    """
    L_orig : groundtruth graph Laplacian
    L_pred : learned graph Laplacian
    """
    # evaluate the performance of graph learning algorithms
    
    n = L_orig.shape[0]
    idx_non_diag = np.triu_indices(n, 1) # excluding diagonal
    
    L_orig_nd = np.diag(np.diag(L_orig)) - L_orig
    edges_groundtruth = (L_orig_nd > 1e-4)[idx_non_diag] + 0

    L_pred_nd = np.diag(np.diag(L_pred)) - L_pred
    edges_learned = (L_pred_nd > 1e-4)[idx_non_diag] + 0
        
    condition_positive = np.sum(edges_groundtruth)
    prediction_positive = np.sum(edges_learned)
    true_positive = np.sum(np.logical_and(edges_groundtruth, edges_learned))
    print(f"condition positive:{condition_positive}, prediction positive:{prediction_positive}, true_positive:{true_positive}")
    
    precision = true_positive / prediction_positive
    recall = true_positive / condition_positive
    
    if precision == 0 or recall == 0:
        f = 0
    else:
        f = 2 * precision * recall / (precision + recall)
    
    NMI = nmi(edges_groundtruth, edges_learned)
    
    R, _ = pearsonr(L_orig[idx_non_diag], L_pred[idx_non_diag])

    return precision, recall, f, NMI, R
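
A small usage sketch for graph_learning_perf_eval with a hand-built path-graph Laplacian; L_est here is just the ground truth with a slightly perturbed diagonal, so every score should be perfect. It assumes the function above is in scope with nmi and pearsonr imported as in its source module:

import numpy as np

# path graph 0-1-2: L = D - A
L_true = np.array([[ 1., -1.,  0.],
                   [-1.,  2., -1.],
                   [ 0., -1.,  1.]])
L_est = L_true + 0.01 * np.eye(3)  # stand-in for a learned Laplacian

precision, recall, f, NMI, R = graph_learning_perf_eval(L_true, L_est)
print(precision, recall, f, NMI, R)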
Example #11
 def testSparsePoissonLookup(self):
     data = self.data
     labels = []
     for i in range(data.shape[1]):
         cell = data[:,i]
         scores = bulk_lookup(self.bulk_means, cell)
         labels.append(scores[0][0])
     nmi_val = nmi(self.labs, labels)
     self.assertTrue(nmi_val > 0.99)
Example #12
 def testCorrLookup(self):
     data_dense = self.data_dense
     labels = []
     for i in range(data_dense.shape[1]):
         cell = data_dense[:,i]
         scores = bulk_lookup(self.bulk_means, cell, method='corr')
         labels.append(scores[0][0])
     nmi_val = nmi(self.labs, labels)
     self.assertTrue(nmi_val > 0.85)
Example #13
 def test_merge(self):
     # create distance matrix
     # find the min distance between two cluster pairs
     m = self.m
     w = self.w
     data_subset = self.data_subset
     clusters_to_merge = [0, 1, 2]
     # merge the min distance pair
     m_merge, w_merge = relabeling.merge_clusters(data_subset,
                                                  m,
                                                  w,
                                                  clusters_to_merge,
                                                  max_iters=20,
                                                  inner_max_iters=50)
     nmi_base = nmi(self.labels, w.argmax(0))
     nmi_merge = nmi(self.labels, w_merge.argmax(0))
     print('nmi after merging the closest pairs: ' + str(nmi_merge))
     self.assertTrue(nmi_merge >= nmi_base - 0.3)
     self.assertEqual(w_merge.shape[0], w.shape[0] - 2)
Example #14
def eval_clustering(y_true, y_pred):
    _, y_true = np.unique(y_true, return_inverse=True)
    _, y_pred = np.unique(y_pred, return_inverse=True)

    acc_score = accuracy_clustering(y_true, y_pred)
    pu_score = purity(y_true, y_pred)
    nmi_score = nmi(y_true, y_pred,
                    average_method='geometric')  # average_method='arithmetic'
    ri_score = ri(y_true, y_pred)
    return acc_score, pu_score, nmi_score, ri_score
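
The accuracy_clustering, purity, and ri helpers above are project-specific; a quick self-contained check of just the NMI part, comparing the two average_method options mentioned in the comment:

import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([1, 1, 0, 0, 0, 2])
print(nmi(y_true, y_pred, average_method='geometric'))   # as used above
print(nmi(y_true, y_pred, average_method='arithmetic'))  # sklearn's current default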
Example #15
 def test_json(self):
     sca = sc_analysis.SCAnalysis(self.data_dir,
                                  frac=0.2,
                                  clusters=8,
                                  data_filename='data.mtx',
                                  baseline_dim_red='tsvd',
                                  dim_red_option='umap',
                                  normalize=1,
                                  cell_frac=0.5,
                                  max_iters=20,
                                  inner_max_iters=20,
                                  use_fdr=1)
     sca.run_full_analysis()
     sca.save_json_reset()
     # delete the whole sca, re-load it from json
     del sca
     sca = sc_analysis.SCAnalysis(self.data_dir)
     sca = sca.load_params_from_folder()
     self.assertEqual(sca.params['clusters'], 8)
     self.assertEqual(sca.params['baseline_dim_red'], 'tsvd')
     self.assertEqual(sca.params['dim_red_option'], 'umap')
     self.assertEqual(sca.params['cell_frac'], 0.5)
     self.assertEqual(sca.params['genes_frac'], 0.2)
     self.assertTrue(sca.params['normalize'])
     self.assertTrue(sca.params['use_fdr'])
     self.assertEqual(sca.uncurl_kwargs['max_iters'], 20)
     self.assertTrue(sca.has_dim_red)
     self.assertTrue(sca.has_w)
     self.assertTrue(sca.has_m)
     self.assertEqual(sca.cell_subset.shape[0], 400)
     means = sca.cluster_means
     self.assertEqual(means.shape[1], 8)
     self.assertEqual(means.shape[0], self.data.shape[0])
     # TODO: do re-clustering
     sca.add_color_track('true_labels', self.labs, is_discrete=True)
     old_labels = sca.labels
     sca.relabel('louvain')
     self.assertFalse((old_labels == sca.labels).all())
     true_labels, is_discrete = sca.get_color_track('true_labels')
     self.assertTrue(nmi(sca.labels, true_labels) > 0.65)
     sca.relabel('leiden')
     self.assertTrue(nmi(sca.labels, true_labels) > 0.65)
Example #16
def test_clustering_structure(n_runs=20):
    nmis_gt = []
    nmis_mcl = []
    nmis_louvain = []
    for i in range(n_runs):
        print("Run number {0}".format(i))
        ensemble_density_huge("file.csv", "\\t")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t",
                                 header=None).values
        dist_dense = dist_dense[:, :-1]
        scaler = QuantileTransformer(n_quantiles=10)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        results_dense = TSNE(
            metric="precomputed").fit_transform(dist_dense_scaled)
        model_kmeans = KMeans(n_clusters=len(set(true)))
        labels_dense_kmeans = model_kmeans.fit_predict(results_dense)
        clusters_mcl = [0 for _ in range(len(adj))]
        result_mcl = mc.run_mcl(adj)  # run MCL with default parameters
        clusters = mc.get_clusters(result_mcl)  # get clusters
        # use a separate counter so the outer run index i is not clobbered
        cluster_id = 0
        for cluster in clusters:
            for j in cluster:
                clusters_mcl[j] = cluster_id
            cluster_id += 1

        partition = louvain.best_partition(G)
        labels_spectral = [v for k, v in partition.items()]

        nmis_gt.append(
            nmi(labels_dense_kmeans, true, average_method="arithmetic"))
        nmis_mcl.append(nmi(clusters_mcl, true, average_method="arithmetic"))
        nmis_louvain.append(
            nmi(labels_spectral, true, average_method="arithmetic"))
    print("GT : {0}, {1}".format(np.mean(nmis_gt), np.std(nmis_gt)))
    print("MCL : {0}, {1}".format(np.mean(nmis_mcl), np.std(nmis_mcl)))
    print("Louvain : {0}, {1}".format(np.mean(nmis_louvain),
                                      np.std(nmis_louvain)))
    return ((nmis_gt, nmis_mcl, nmis_louvain))
Example #17
    def test_split(self):
        # 5. building a distance matrix between clusters, find closest pair

        # 6. run split_cluster - split the largest cluster
        m = self.m
        w = self.w
        data_subset = self.data_subset
        labels = self.labels
        clusters = w.argmax(0)
        cluster_counts = Counter(clusters)
        top_cluster, top_count = cluster_counts.most_common()[0]

        m_split, w_split = relabeling.split_cluster(data_subset,
                                                    m,
                                                    w,
                                                    top_cluster,
                                                    max_iters=20,
                                                    inner_max_iters=50)
        nmi_base = nmi(labels, w.argmax(0))
        nmi_split = nmi(labels, w_split.argmax(0))
        print('nmi after splitting the largest cluster: ' + str(nmi_split))
        self.assertTrue(nmi_split >= nmi_base - 0.02)
        self.assertEqual(w_split.shape[0], w.shape[0] + 1)
Example #18
 def test_new(self):
     """
     Tests creating a new cluster from a selection of cells
     """
     data_subset = self.data_subset
     m = self.m
     w = self.w
     selected_cells = list(range(375, w.shape[1]))
     m_new, w_new = relabeling.new_cluster(data_subset,
                                           m,
                                           w,
                                           selected_cells,
                                           max_iters=20,
                                           inner_max_iters=50)
     nmi_base = nmi(self.labels, w.argmax(0))
     nmi_new = nmi(self.labels, w_new.argmax(0))
     self.assertTrue(w_new.shape[0] == 9)
     print('nmi after creating a new cluster: ' + str(nmi_new))
     self.assertTrue(nmi_new >= nmi_base - 0.1)
     self.assertEqual(w_new.shape[0], w.shape[0] + 1)
     self.assertTrue(
         sum((w_new.argmax(0)[selected_cells] == 8
              )) >= len(selected_cells) / 2)
Example #19
    def setUp(self):
        data = scipy.io.loadmat('data/10x_pooled_400.mat')

        data_csc = data['data']
        self.labels = data['labels'].flatten()
        #gene_names = data['gene_names']

        # 2. gene selection
        genes = uncurl.max_variance_genes(data_csc)
        self.data_subset = data_csc[genes, :]
        #gene_names_subset = gene_names[genes]

        # 3. run uncurl
        m, w, ll = uncurl.run_state_estimation(self.data_subset,
                                               8,
                                               max_iters=20,
                                               inner_max_iters=50)
        print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
        self.m = m
        self.w = w
Example #20
 def test_10x_auto_cluster(self):
     """
     Test using automatic cluster size determination
     """
     from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
     # gene selection
     genes = uncurl.max_variance_genes(self.data)
     data_subset = self.data[genes, :]
     # smaller # of iterations than default so it finishes faster...
     M, W, ll = uncurl.run_state_estimation(data_subset,
                                            clusters=0,
                                            max_iters=10,
                                            inner_max_iters=80)
     labels = W.argmax(0)
     # NMI should be > 0.75 on 10x_pure_pooled
     # (accounting for lower than default iter count)
     self.assertTrue(nmi(self.labs, labels) > 0.6)
     # test RMSE
     test_data = np.dot(M, W)
     error = data_subset.toarray() - test_data
     error = np.sqrt(np.mean(error**2))
     print('data subset RMSE:', error)
     self.assertTrue(error < 2.0)
Example #21
                               n_col_clusters=4,
                               max_iter=100)
model_2._fit_single(X, random_state=None)

# In[5]:

model_2.fit(X)

# In[8]:

model_2.row_labels_

# In[9]:

predicted_labels_2 = model_2.row_labels_
print(nmi(true_labels, predicted_labels_2), acc(true_labels,
                                                predicted_labels_2),
      ars(true_labels, predicted_labels_2),
      amis(true_labels, predicted_labels_2))

# In[11]:

model_5 = NMTFcoclus_ONM3F.ONM3F(n_row_clusters=4, n_col_clusters=4)
model_5.fit(X)

# In[15]:

predicted_labels_5 = model_5.row_labels_
print(nmi(true_labels, predicted_labels_5), acc(true_labels,
                                                predicted_labels_5),
      ars(true_labels, predicted_labels_5),
Example #22
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  2 19:00:21 2018

@author: jimmybow
"""

import numpy as np
import pandas as pd  # used below for pd.Series.value_counts

from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from waveCluster import *

data = np.array(read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS))
tags = waveCluster(data, scale=144, threshold=-0.5, plot=True)
true_tags = np.arange(len(data)) >= 400
draw2Darray(data[:, 0], data[:, 1], tags)
draw2Darray(data[:, 0], data[:, 1], true_tags)

print(pd.Series.value_counts(tags))
# Normalized mutual information score: normalized_mutual_info_score
print(nmi(true_tags, tags))
Example #23
File: demo.py  Project: xuyaokui/scBKAP
"""

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.cluster import adjusted_rand_score as ari
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

X = pd.read_csv('yan/yan.csv', header=None)
X = np.array(X)
X = X.transpose()

label = pd.read_csv('yan/yan_label.csv')
y = np.array(label)
label = y.ravel()

pca = PCA(n_components=2)
A = pca.fit_transform(X)

c = label.max()
kk = KMeans(n_clusters=c)
julei = kk.fit(A)
julei = julei.labels_

print('NMI value is %f \n' % nmi(julei.flatten(), label.flatten()))
print('ARI value is %f \n' % ari(julei.flatten(), label.flatten()))
print('HOM value is %f \n' % metrics.homogeneity_score(julei, label))
print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, julei))
Example #24
def evaluate(result, clu_list):
    eva = nmi(clu_list, result, average_method='arithmetic')
    #    print("采用NMI评测方法,预测正确率为:%s" % eva)
    # print(nmi(clu_list, result, average_method='warn'))
    return eva
Example #25
filename = address + str(noise) + '.csv'
data = []
with open(filename) as f:
    f_csv = csv.reader(f)
    for row in f_csv:
        data.append(row)
data = np.array(data).astype(float)
# finished reading, start clustering
normData = normalizeData(data)
scale = 128
dim = 2
wavelet = 'db2'
wavelength = {'db1': 0, 'db2': 1, 'bior1.3': 2}
dataDic = map2ScaleDomain(normData, scale)
dwtResult = ndWT(dataDic, 2, scale, wavelet)
threshold = getThreshold(dwtResult)
print("threshold:")
print(threshold)

#show threshold on the chart
showThreshold(dwtResult, threshold)
lineLen = scale / 2 + wavelength.get(wavelet)
result = thresholding(dwtResult, threshold, lineLen, dim)
tags = markData(normData, result, lineLen)

#show the result after clustering
draw2Darray(normData[:, 0], normData[:, 1], np.array(tags))
quality = nmi(list(normData[:, normData.shape[1] - 1]), tags)
print("AMI:")
print(quality)
Example #26
sca.cell_sample
sca.cell_subset
sca.cell_subset.shape
labels = sca.w.argmax(0)

from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
labels_b = pd.read_csv('80k_cluster_numbers.csv')
labels_b = labels_b.iloc[:, 1]
labels_b = labels_b.values
labels_b_subset = labels_b[sca.cell_subset]
nmi(labels_b_subset, labels)

from sklearn.metrics.cluster import adjusted_rand_score as ari
ari(labels_b_subset, labels)

cluster_counts = np.zeros((len(set(labels_b)), len(set(labels))))
for i, j in zip(labels_b_subset, labels):
    cluster_counts[i, j] += 1

plt.figure(figsize=(10, 25))
sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis],
            yticklabels=sorted(list(set(labels_b_subset))),
            vmin=0,
            vmax=1,
            linewidths=0.5)
plt.xlabel('UNCURL clusters')
Example #27
                X,
                sigma,
                K,
                W,
                bound_,
                SLK_option,
                C_init,
                bound_lambda=lmbda,
                bound_iterations=200)

        if ts:
            trivial[count] = 1
            continue

        # Evaluate the performance on validation set
        current_nmi = nmi(gnd_val, l[val_ind])
        acc, _ = get_accuracy(gnd_val, l[val_ind])

        print('lambda = ', lmbda, ' : NMI= %0.4f' % current_nmi)
        print('accuracy %0.4f' % acc)

        if current_nmi > bestnmi:
            bestnmi = current_nmi
            best_lambda_nmi = lmbda

        if acc > bestacc:
            bestacc = acc
            best_lambda_acc = lmbda
            best_C_init = C_init.copy()

        print('Best result: NMI= %0.4f' % bestnmi, '|NMI lambda = ',
Example #28
if __name__=="__main__":
	import sys
	import math as m
	import numpy as np
	from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
	fp=open(sys.argv[1],'r')
	featLen=len(fp.readline().strip().split("\t"))-1
	fp.close()
#	dlt=sys.argv[2]
#	if dlt == "0":
	dlt="\t"
	col=int(sys.argv[2])
	train=np.loadtxt(sys.argv[1],dtype=str,delimiter=dlt,usecols=(range(featLen)))
	target=np.loadtxt(sys.argv[1],dtype=str,delimiter=dlt,usecols=(featLen,))
	mi=[]
	for i in range(featLen):
		mi.append((nmi(train[:,i],target),i))
#        sys.exit()
	print(sorted(mi, reverse=True))
#	print "Relevance = ",mi
	target=train[:,col]
	mi=[]
	for i in range(0,featLen):
		if i==col:
			continue
		data=train[:,i]
		mi.append((nmi(data,target),i))
	print('Redundancy of', sys.argv[2], sorted(mi, reverse=True))
		

Example #29
if __name__ == '__main__':
    import time
    # load/subset data
    data_mat = scipy.io.loadmat('../data/10x_pooled_400.mat')
    data = data_mat['data']
    gene_subset = uncurl.max_variance_genes(data)
    data_subset = data[gene_subset, :]

    # run bnpy clustering?
    true_labels = data_mat['labels'].flatten()
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data_subset)
    print(selected_k)
    print('nmi: ' + str(nmi(true_labels, labels)))
    print('time: ' + str(time.time() - t0))

    data_mat_2 = scipy.io.loadmat('../../uncurl_python/data/SCDE_k2_sup.mat')
    data = data_mat_2['Dat']
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data)
    true_labels = data_mat_2['Lab'].flatten()
    print(selected_k)
    print('nmi: ' + str(nmi(true_labels, labels)))
    print('time: ' + str(time.time() - t0))

    # Zeisel 7-cluster dataset
    data_mat_3 = scipy.io.loadmat('../../uncurl_python/data/GSE60361_dat.mat')
    data = data_mat_3['Dat']
    gene_subset = uncurl.max_variance_genes(data)
Example #30
File: metrics.py  Project: hsfzxjy/WRSFKM
def nmi_acc(U, labels):

    X = np.argmax(U, axis=1)

    return nmi(X, labels), acc(X, labels)
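
The acc helper used by nmi_acc above is project-specific and not shown; a small sketch of just the argmax-then-NMI step, assuming U is an (n_samples, n_clusters) membership matrix:

import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

U = np.array([[0.9, 0.1],
              [0.7, 0.3],
              [0.2, 0.8],
              [0.1, 0.9]])
labels = np.array([0, 0, 1, 1])
pred = np.argmax(U, axis=1)  # hard assignments [0, 0, 1, 1]
print(nmi(pred, labels))     # 1.0 for this toy U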
Example #31
# 1. load data
data = scipy.io.loadmat('data/10x_pooled_400.mat')

data_csc = data['data']
labels = data['labels'].flatten()
gene_names = data['gene_names']

# 2. gene selection
genes = uncurl.max_variance_genes(data_csc)
data_subset = data_csc[genes,:]
gene_names_subset = gene_names[genes]

# 3. run uncurl
m, w, ll = uncurl.run_state_estimation(data_subset, 8)
print('nmi basic: ' + str(nmi(labels, w.argmax(0))))


# 4. run clustering
for metric in ['euclidean', 'cosine']:
    for n_neighbors in [10, 15, 20]:
        print('n_neighbors: ', n_neighbors, ' metric: ', metric)
        w_graph = clustering_methods.create_graph(w.T, n_neighbors=n_neighbors, metric=metric)
        clusters = clustering_methods.run_leiden(w_graph)
        print('nmi leiden: ' + str(nmi(labels, clusters)))
        clusters_louvain = clustering_methods.run_louvain(w_graph)
        print('nmi louvain: ' + str(nmi(labels, clusters_louvain)))

# 5. try running clustering w/o uncurl
clustering_result = clustering_methods.baseline_cluster(data_subset)
# TODO: figure out cuts