Example #1
def measure(FC, GC, SL, TL):
    nmi_s = NMI(SL, FC)
    nmi_t = NMI(TL, GC)
    ari_s = ARI(SL, FC)
    ari_t = ARI(TL, GC)

    # print(len(set(FC)), len(set(GC)))
    pri_s = purity(FC, SL)
    pri_t = purity(GC, TL)
    ps, rs, fs = PRF1(FC, SL)
    pt, rt, ft = PRF1(GC, TL)

    perform_source = [nmi_s, ari_s, pri_s, ps, rs, fs]
    perform_target = [nmi_t, ari_t, pri_t, pt, rt, ft]
    return perform_source, perform_target
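The NMI and ARI calls above are scikit-learn's normalized_mutual_info_score and adjusted_rand_score, but the purity and PRF1 helpers are not shown. A minimal purity sketch, assuming the usual contingency-matrix definition (the exact PRF1 variant used here is unknown, so it is left out):

import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity(labels_pred, labels_true):
    # Hypothetical helper: each predicted cluster is credited with its
    # majority ground-truth label, and the credits are summed over samples.
    cm = contingency_matrix(labels_true, labels_pred)  # rows: true, cols: predicted
    return cm.max(axis=0).sum() / cm.sum()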
Example #2
 def get_performance(self, y_true, y_pred):
     purity = self.get_purity(y_true, y_pred)
     from sklearn.metrics import normalized_mutual_info_score as NMI
     from sklearn.metrics import adjusted_rand_score as ARI
     nmi = NMI(y_true, y_pred)
     ari = ARI(y_true, y_pred)
     return purity, nmi, ari
Example #3
def cluster_scores(latent_space, K, labels_true):
    labels_pred = KMeans(K).fit_predict(latent_space)
    return [
        silhouette_score(latent_space, labels_true),
        NMI(labels_true, labels_pred),
        ARI(labels_true, labels_pred)
    ]
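A hedged usage sketch for cluster_scores; latent and y below are synthetic placeholders, and KMeans, silhouette_score, NMI and ARI are assumed to be the scikit-learn imports used throughout these examples:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import normalized_mutual_info_score as NMI
from sklearn.metrics import adjusted_rand_score as ARI

rng = np.random.default_rng(0)
latent = rng.normal(size=(300, 10))    # stand-in for a latent space
y = rng.integers(0, 4, size=300)       # stand-in for ground-truth labels
sil, nmi, ari = cluster_scores(latent, 4, y)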
Example #4
    def test(self, embed):
        acc_scores = []
        nmi_scores = []
        ami_scores = []
        ari_scores = []
        true_labels = self.data.labels.cpu()
        for _ in range(10):
            if self.clustering == 'kmeans':
                pred = KMeans(
                    n_clusters=self.data.num_classes).fit_predict(embed)
            else:
                pred = SpectralClustering(
                    n_clusters=self.data.num_classes).fit_predict(embed)
            acc = self.accuracy(pred)
            nmi = NMI(true_labels, pred, average_method='arithmetic')
            ami = AMI(true_labels, pred, average_method='arithmetic')
            ari = ARI(true_labels, pred)
            acc_scores.append(acc)
            nmi_scores.append(nmi)
            ami_scores.append(ami)
            ari_scores.append(ari)

        print("ACC", mean(acc_scores), std(acc_scores))
        print("NMI", mean(nmi_scores), std(nmi_scores))
        print("AMI", mean(ami_scores), std(ami_scores))
        print("ARI", mean(ari_scores), std(ari_scores))
Example #5
def compute_dist():
    with open(path + 'Trapnell_TCC_pairwise_distance_dge.dat', 'wb') as outfile:
        pickle.dump(D, outfile)

path = '/home/zgy_ucla_cs/Research/singleCell/TCC_old_pipeline/scRNA-Clustering/Trapnell_pipeline/'

with open(path + "Trapnell_TCC_pairwise_distance_21.dat", 'rb') as f:
    D=pickle.load(f, encoding='latin1')

with open(path + "Trapnell_TCC_pairwise_distance_31.dat", 'rb') as f:
    D=pickle.load(f, encoding='latin1')    

cluster_labels = np.loadtxt(path + 'Trapnells_data/Trapnell_labels.txt',dtype=str).astype(int)-1    
num_of_clusters=3
similarity_mat= D.max()-D
labels_spectral = spectral(num_of_clusters,similarity_mat)
print(NMI(cluster_labels, labels_spectral), ARI(cluster_labels, labels_spectral))  

# ===================== scVI =====================
# expression_train, expression_test, cluster_labels, c_test = train_test_split(X, X_type, random_state=0)


batch_size = 128
learning_rate = 0.001
epsilon = 0.01
latent_dimension = 10





tf.reset_default_graph()
expression = tf.placeholder(tf.float32, (None, X.shape[1]), name='x')
kl_scalar = tf.placeholder(tf.float32, (), name='kl_scalar')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=epsilon)
training_phase = tf.placeholder(tf.bool, (), name='training_phase')

# getting priors
log_library_size = np.log(np.sum(X, axis=1))
mean, var = np.mean(log_library_size), np.var(log_library_size)

# loading data
model = scVI.scVIModel(expression=expression, kl_scale=kl_scalar, \
                         optimize_algo=optimizer, phase=training_phase, \
                          library_size_mean=mean, library_size_var=var, n_latent=latent_dimension)

#starting computing session
sess = tf.Session()

 # Initialize the graph and fit the training set
# this takes less than a minute on a Tesla K80
sess.run(tf.global_variables_initializer())
result = train_model(model, (X, X), sess, 250, batch_size=batch_size)

dic_full = {expression: X, training_phase:False}
latent = sess.run(model.z, feed_dict=dic_full)
# clustering_score = cluster_scores(latent, len(cell_types), cluster_labels)
clustering_score = cluster_scores(latent, np.max(cluster_labels), cluster_labels)
print("Silhouette", clustering_score[0], "\nAdjusted Rand Index", clustering_score[1], \
        "\nNormalized Mutual Information", clustering_score[2])
Example #6
def evaluate(net, loader):
    """evaluates on provided data
    """

    net.eval()
    predicts = np.zeros(len(loader.dataset), dtype=np.int32)
    labels = np.zeros(len(loader.dataset), dtype=np.int32)
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(loader):
            logger.progress('processing %d/%d batch' %
                            (batch_idx, len(loader)))
            inputs = inputs.to(cfg.device, non_blocking=True)
            # assuming the last head is the main one
            # output dimension of the last head
            # should be consistent with the ground-truth
            logits = net(inputs)[-1]
            start = batch_idx * loader.batch_size
            end = start + loader.batch_size
            end = min(end, len(loader.dataset))
            labels[start:end] = targets.cpu().numpy()
            predicts[start:end] = logits.max(1)[1].cpu().numpy()

    # compute accuracy
    num_classes = labels.max().item() + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    for i in range(predicts.shape[0]):
        count_matrix[predicts[i], labels[i]] += 1
    reassignment = np.dstack(
        linear_sum_assignment(count_matrix.max() - count_matrix))[0]
    acc = count_matrix[reassignment[:, 0], reassignment[:, 1]].sum().astype(
        np.float32) / predicts.shape[0]
    return acc, NMI(labels, predicts), ARI(labels, predicts)
Example #7
    def clustering_scores(self,
                          name,
                          verbose=True,
                          prediction_algorithm='knn'):
        if self.gene_dataset.n_labels > 1:
            latent, _, labels = get_latent(self.model, self.data_loaders[name])
            if prediction_algorithm == 'knn':
                labels_pred = KMeans(self.gene_dataset.n_labels,
                                     n_init=200).fit_predict(
                                         latent)  # n_jobs>1 ?
            elif prediction_algorithm == 'gmm':
                gmm = GMM(self.gene_dataset.n_labels)
                gmm.fit(latent)
                labels_pred = gmm.predict(latent)

            asw_score = silhouette_score(latent, labels)
            nmi_score = NMI(labels, labels_pred)
            ari_score = ARI(labels, labels_pred)
            uca_score = unsupervised_clustering_accuracy(labels,
                                                         labels_pred)[0]
            if verbose:
                print(
                    "Clustering Scores for %s:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
                    % (name, asw_score, nmi_score, ari_score, uca_score))
            return asw_score, nmi_score, ari_score, uca_score
Example #8
def clustering_scores(n_labels, labels, latent, prediction_algorithm="knn"):
    if n_labels > 1:
        if prediction_algorithm == "knn":
            labels_pred = KMeans(n_labels,
                                 n_init=200).fit_predict(latent)  # n_jobs>1 ?
        elif prediction_algorithm == "gmm":
            gmm = GMM(n_labels)
            gmm.fit(latent)
            labels_pred = gmm.predict(latent)

        ari_score = ARI(labels, labels_pred)
        return ari_score
Example #9
def analysis():
    df_gan = pd.read_csv('result/pbmc_two_batch-cluster_result.csv',
                         delimiter=',')
    df_lsi = pd.read_csv('result/pbmc_two_batch-LSI-cluster_result.csv',
                         delimiter=',')

    labels = df_lsi['predicted label'].values
    labels_pred = df_gan['predicted label'].values

    nmi_score = NMI(labels, labels_pred)
    ari_score = ARI(labels, labels_pred)
    print("Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" %
          (nmi_score, ari_score))
Example #10
def clustering():
    str_input = [i.strip() for i in open('./ts2str', 'r').readlines()]
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       analyzer='char',
                                       ngram_range=(2, 5))
    str2tfidf = tfidf_vectorizer.fit_transform(str_input)
    #print tfidf_vectorizer.get_feature_names()
    sc = SpectralClustering(
        n_clusters=113,
        eigen_solver='arpack',
        affinity="nearest_neighbors",
        #assign_labels="discretize"
    )
    y_pred = sc.fit_predict(str2tfidf)
    #y_true = [int(i.strip()) for i in open('./ts_cluster','r').readlines()]
    y_true = [i.strip() for i in open('./ts_type', 'r').readlines()]
    y_true = y_true[1:]
    print(ARI(y_true, y_pred))
    y_shuffle = list(y_true)
    random.shuffle(y_shuffle)
    print(ARI(y_true, y_shuffle))
Example #11
 def clustering_scores(self, name, verbose=True):
     if self.gene_dataset.n_labels > 1:
         latent, _, labels = get_latent(self.model, self.data_loaders[name])
         labels_pred = KMeans(self.gene_dataset.n_labels,
                              n_init=200).fit_predict(latent)  # n_jobs>1 ?
         asw_score = silhouette_score(latent, labels)
         nmi_score = NMI(labels, labels_pred)
         ari_score = ARI(labels, labels_pred)
         if verbose:
             print(
                 "Clustering Scores for %s:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f"
                 % (name, asw_score, nmi_score, ari_score))
         return asw_score, nmi_score, ari_score
Example #12
def get_performance(y_true, y_pred, n_cluster):
    """
    获取当前轮次的评估指标
    :param y_true:
    :param y_pred:
    :param n_cluster:
    :return:
    """
    purity = get_purity(y_true, y_pred, n_cluster)
    from sklearn.metrics import normalized_mutual_info_score as NMI
    from sklearn.metrics import adjusted_rand_score as ARI
    nmi = NMI(y_true, y_pred)
    ari = ARI(y_true, y_pred)
    return purity, nmi, ari
Example #13
    def score(self, z, true_labels):
        assert self.labels is not None, "Cannot compute clustering scores before fitting the data."

        self.silhouette_score = silhouette_score(z, self.labels)
        self.nmi = NMI(
            true_labels, self.labels,
            average_method="geometric")  # Same average as original paper
        self.ari = ARI(true_labels, self.labels)

        true_k = len(np.unique(true_labels))
        if self.k == true_k:
            self.accuracy = accuracy(true_labels, self.labels)
        else:
            print(
                "Fitted number of labels ({}) is not equal to given true number of labels ({}). Cannot "
                "compute accuracy.".format(self.k, true_k))
Example #14
File: wsi.py Project: nvanva/BOS_AggloSil
def clusterize_search(word, vecs, gold_sense_ids=None,
                      ncs=list(range(1, 5, 1)) + list(range(5, 12, 2)),
                      affinities=('cosine',), linkages=('average',)):
    if linkages is None:
        linkages = sklearn.cluster.hierarchical._TREE_BUILDERS.keys()
    if affinities is None:
        affinities = ('cosine', 'euclidean', 'manhattan')
    sdfs = []
    mem = Memory('maxari_cache', verbose=0)

    zero_vecs = ((vecs ** 2).sum(axis=-1) == 0)
    if zero_vecs.sum() > 0:
        vecs = np.concatenate((vecs, zero_vecs[:, np.newaxis].astype(vecs.dtype)), axis=-1)

    best_clids = None
    best_silhouette = 0
    distances = []

    for affinity in affinities:
        distance_matrix = cdist(vecs, vecs, metric=affinity)
        distances.append(distance_matrix)
        for nc in ncs:
            for linkage in linkages:
                if linkage == 'ward' and affinity != 'euclidean':
                    continue
                clr = AgglomerativeClustering(affinity='precomputed', linkage=linkage, n_clusters=nc, memory=mem)
                clids = clr.fit_predict(distance_matrix) if nc > 1 else np.zeros(len(vecs))

                ari = ARI(gold_sense_ids, clids) if gold_sense_ids is not None else np.nan
                sil_cosine = -1. if len(np.unique(clids)) < 2 else silhouette_score(vecs, clids, metric='cosine')
                sil_euclidean = -1. if len(np.unique(clids)) < 2 else silhouette_score(vecs, clids, metric='euclidean')
                vc = '' if gold_sense_ids is None else '/'.join(
                                        np.sort(pd.value_counts(gold_sense_ids).values)[::-1].astype(str))
                if sil_cosine > best_silhouette:
                    best_silhouette = sil_cosine
                    best_clids = clids

                sdf = pd.DataFrame({'ari': ari,
                                    'word': word, 'nc': nc,
                                    'sil_cosine': sil_cosine,
                                    'sil_euclidean': sil_euclidean,
                                    'vc': vc,
                                    'affinity': affinity, 'linkage': linkage}, index=[0])

                sdfs.append(sdf)

    sdf = pd.concat(sdfs, ignore_index=True)
    return best_clids, sdf, None, distances
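A hedged usage sketch for clusterize_search; the word and vectors below are synthetic placeholders, and the function's own dependencies (numpy, pandas, scipy's cdist, scikit-learn, joblib's Memory) are assumed to be imported as in the original module. It returns the clustering with the best cosine silhouette plus a per-configuration score table:

import numpy as np

rng = np.random.default_rng(0)
vecs = rng.normal(size=(40, 16))    # stand-in for word-usage vectors
gold = np.repeat([0, 1], 20)        # two gold senses
best_clids, sdf, _, distances = clusterize_search('bank', vecs, gold_sense_ids=gold)
print(sdf.sort_values('ari', ascending=False).head())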
Example #15
    def clustering_scores(self, prediction_algorithm="knn"):
        if self.gene_dataset.n_labels > 1:
            latent, _, labels = self.get_latent()
            if prediction_algorithm == "knn":
                labels_pred = KMeans(self.gene_dataset.n_labels, n_init=200).fit_predict(latent)  # n_jobs>1 ?
            elif prediction_algorithm == "gmm":
                gmm = GMM(self.gene_dataset.n_labels)
                gmm.fit(latent)
                labels_pred = gmm.predict(latent)

            asw_score = silhouette_score(latent, labels)
            nmi_score = NMI(labels, labels_pred)
            ari_score = ARI(labels, labels_pred)
            uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
            logger.debug("Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f" %
                         (asw_score, nmi_score, ari_score, uca_score))
            return asw_score, nmi_score, ari_score, uca_score
Example #16
def main():
    cwd = os.getcwd() + '/' + sys.argv[0].replace('test.py', '')
    sys.stdout.write("Testing... this may take a few minutes.\n")
    sys.stdout.flush()
    process = Popen("CHIMERA -i "+ cwd+"/test_data.csv -r "+ cwd+"/output.txt " +\
             "-k 2 -m 20 -N 3 -e 0.01", shell=True)
    process.communicate()

    with open(cwd + '/output.txt') as f:
        out_label = numpy.asarray(list(csv.reader(f, delimiter='\t')))
    idx = numpy.nonzero(out_label[0] == "Cluster")[0]
    out_label = out_label[1:, idx].flatten().astype(int)

    true_label = numpy.append(numpy.ones(250), numpy.ones(250) * 2)

    measure = ARI(true_label, out_label)
    sys.stdout.write("Test Complete, output labels in test/ folder.\n")
    sys.stdout.write(
        "Clustering test samples yields an adjusted rand index of %.3f with ground truth labels.\n"
        % measure)
    if measure >= 0.9: sys.stdout.write("Test is successful.\n")
Example #17
def clustering_scores(X, y, prediction_algorithm='knn'):
    from sklearn.metrics import adjusted_rand_score as ARI
    from sklearn.metrics import normalized_mutual_info_score as NMI
    from sklearn.metrics import silhouette_score
    from sklearn.mixture import GaussianMixture as GMM
    from sklearn.cluster import KMeans

    cluster_num = np.unique(y).shape[0]
    if prediction_algorithm == 'knn':
        labels_pred = KMeans(cluster_num, n_init=200).fit_predict(X)
    elif prediction_algorithm == 'gmm':
        gmm = GMM(cluster_num)
        gmm.fit(X)
        labels_pred = gmm.predict(X)
    labels = y
    asw_score = silhouette_score(X, labels)
    nmi_score = NMI(labels, labels_pred)
    ari_score = ARI(labels, labels_pred)
    labels_int = convert_label_to_int(labels)
    uca_score = unsupervised_clustering_accuracy(labels_int, labels_pred)[0]
    return asw_score, nmi_score, ari_score, uca_score
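convert_label_to_int and unsupervised_clustering_accuracy come from the surrounding codebase; the latter is presumably a Hungarian-matched accuracy along the lines sketched after Example #4 (apparently returning a tuple whose first element is the score, hence the [0]), and a minimal sketch of the former, assuming it only recodes arbitrary label values as consecutive integers:

import numpy as np

def convert_label_to_int(labels):
    # Hypothetical helper: map arbitrary label values (e.g. strings) to
    # integer codes 0..n_classes-1, preserving the grouping.
    _, codes = np.unique(labels, return_inverse=True)
    return codes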
Example #18
def test_casc_cca():

    n = [10, 10]
    p = [[0.8, 0.2], [0.2, 0.8]]
    np.random.seed(105)
    A = sbm(n=n, p=p)
    covariates = np.array(
        [
            [1.0, 0.0],
            [1.0, 1.0],
            [1.0, 0.0],
            [1.0, 0.0],
            [1.0, 0.0],
            [1.0, 0.0],
            [1.0, 0.0],
            [1.0, 0.0],
            [0.0, 0.0],
            [0.0, 0.0],
            [0.0, 1.0],
            [0.0, 1.0],
            [0.0, 1.0],
            [0.0, 0.0],
            [0.0, 0.0],
            [0.0, 1.0],
            [1.0, 1.0],
            [1.0, 1.0],
            [1.0, 0.0],
            [0.0, 1.0],
        ]
    )
    casc = CovariateAssistedSpectralEmbed(
        n_components=2, assortative=True, cca=True, check_lcc=False
    )
    casc_results = casc.fit_predict(np.array(A), covariates, y=None, return_full=False)
    ans = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ResultARI = ARI(casc_results, ans)

    assert ResultARI == 1
Example #19
# ============================== spectral ==============================

# Load Z TCC 31
path = '/home/zgy_ucla_cs/Research/singleCell/TCC_old_pipeline/scRNA-Clustering/Zeisel_pipeline/mat_31/'
with open(path + "pwise_dist_L1.dat", 'rb') as f:
    D = pickle.load(f)  #, encoding='latin1')

path = '/home/zgy_ucla_cs/Research/singleCell/scRNA-Seq-TCC-prep/Zeisel/'
with open(path + "pwise_dist_l1.dat", 'rb') as f:
    D_l1 = pickle.load(f)  #, encoding='latin1')

D = D_l1
num_of_clusters = 9
similarity_mat = D.max() - D
labels_spectral = spectral(num_of_clusters, similarity_mat)
print(NMI(cluster_labels, labels_spectral), ARI(cluster_labels,
                                                labels_spectral))

# pwise_dist_latent = latent_dist(X, 10)
# similarity_mat=pwise_dist_latent.max()-pwise_dist_latent
# labels_spectral = spectral(num_of_clusters,similarity_mat)
# print(NMI(cluster_labels, labels_spectral), ARI(cluster_labels, labels_spectral))

# ===================== scVI =====================
# expression_train, expression_test, c_train, c_test = train_test_split(X, X_type, random_state=0)
expression_train, expression_test, c_train, c_test = train_test_split(
    expression_data, X_type, random_state=0)

log_library_size = np.log(np.sum(expression_train, axis=1))
mean, var = np.mean(log_library_size), np.var(log_library_size)

batch_size = 128
Example #20
accm = Accumulator('model ll', 'oracle ll', 'ARI', 'NMI', 'k-MAE', 'et')
for dataset in tqdm(benchmark):
    true_labels = to_numpy(dataset['labels'].argmax(-1))
    X = to_numpy(dataset['X'])
    ll = 0
    ari = 0
    nmi = 0
    mae = 0
    et = 0
    for b in range(len(X)):
        tick = time.time()
        vbmog.run(X[b], verbose=False)
        et += time.time() - tick
        ll += vbmog.loglikel(X[b])
        labels = vbmog.labels()
        ari += ARI(true_labels[b], labels)
        nmi += NMI(true_labels[b], labels, average_method='arithmetic')
        mae += abs(len(np.unique(true_labels[b])) - len(np.unique(labels)))

    ll /= len(X)
    ari /= len(X)
    nmi /= len(X)
    mae /= len(X)
    et /= len(X)

    accm.update([ll.item(), dataset['ll'], ari, nmi, mae, et])

save_dir = os.path.join(results_path, 'baselines', 'vbmog')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
logger = get_logger('vbmog_baseline', os.path.join(save_dir, args.filename))
Example #21
                           random_state=random_state).fit_predict(dataset)

    # run HAC with average linkage
    labels_HAC_averange = AgglomerativeClustering(
        n_clusters=K, linkage="average").fit_predict(dataset)

    # run HAC with single linkage
    links = linkage(dataset, "single")
    labels_HAC_single = fcluster(links, K, criterion="maxclust")

    # run HAC with complete linkage
    labels_HAC_complete = AgglomerativeClustering(
        n_clusters=K, linkage="complete").fit_predict(dataset)

    # compute the ARI metric for each clustering algorithm
    result_ARI[j][0] = ARI(labels_true, labels_kmeans)
    result_ARI[j][1] = ARI(labels_true, labels_HAC_averange)
    result_ARI[j][2] = ARI(labels_true, labels_HAC_single)
    result_ARI[j][3] = ARI(labels_true, labels_HAC_complete)

    # check the dataset dimensionality to decide whether to draw the scatter plots
    if (D == 2):

        # create the figure for the scatter plots
        figure = plot.figure(figsize=(12, 18))

        # create the Ground Truth scatter plot
        plot.subplot(321)
        plot.scatter(dataset[:, 0], dataset[:, 1], c=labels_true, linewidth=1)
        plot.title("Ground Truth", fontsize=18, fontweight="bold")
Example #22
def clustering_metrics(labels_pred, labels_true):
    return {"NMI":NMI(labels_true, labels_pred), "ARI":ARI(labels_true, labels_pred),\
            "F1":f1_score(labels_true, labels_pred, average='weighted')}
Example #23
        accPRcut = (accPRcut + 0.0) / (s0 * s1)

        arr_scale.append(scale)
        arr_processtime.append(process_time)
        arr_Rcut_time.append(Rcut_time)
        arr_accRcut.append(accRcut)
        arr_PRcut_time.append(PRcut_time)
        arr_accPRcut.append(accPRcut)

        print('***********************************')
        print('                   Scale = ', img.shape)
        print('Constructing graph time  = ', process_time)
        print('Ratio cut          time  = ', Rcut_time)
        print('               Accuracy  = ', AMI(img_gt.flatten(), labelsRcut),
              ARI(img_gt.flatten(), labelsRcut), accRcut)
        print('PowerRatio cut     time  = ', PRcut_time)
        print('               Accuracy  = ', AMI(img_gt.flatten(), labelsPRcut),
              ARI(img_gt.flatten(), labelsPRcut), accPRcut)
        print('***********************************')

# Writing the results to a csv file
f = open('results_8c.csv', "w+")
for i in range(len(arr_scale)):
    l = ""
    l += str(arr_scale[i]) + "," + str(arr_processtime[i]) + ","
    l += str(arr_Rcut_time[i]) + "," + str(arr_accRcut[i]) + "," + str(
        AMI(img_gt.flatten(), labelsRcut)) + "," + str(
Example #24
    print("The number of dimensions is: " + str(data.shape[1]))
    return data


if __name__ == "__main__":
    from FlowGrid import *
    import numpy as np
    from time import time
    import argparse
    import os
    file, bin_n, MinDenC, eps, output, label_file = setting_arg()
    data = check_file_valid(file)
    t1 = time()
    if MinDenC:
        fg = FlowGrid(data, bin_n=bin_n, eps=eps, MinDenC=MinDenC)
    else:
        fg = FlowGrid(data, bin_n=bin_n, eps=eps)
    label = fg.clustering()
    print("runing time: " + str(round(time() - t1, 3)))
    if output:
        np.savetxt(output, label, delimiter=',', fmt="%d")
    else:
        np.savetxt(file[:-4] + "_FlowGrid_label.csv",
                   label,
                   delimiter=',',
                   fmt="%d")
    if label_file:
        from sklearn.metrics import adjusted_rand_score as ARI
        true_label = np.genfromtxt(label_file, delimiter=',', skip_header=1)
        print("ARI:" + str(round(ARI(true_label, label), 4)))
Example #25
    m = 1.2
    n_runs = 5

    for adataset in range(len(datasets)):
        print('Dataset: ', datasets[adataset])

        tmp = np.load('datasets/'+datasets[adataset])
        X = tmp['X']
        y = tmp['y']
        k = len(np.unique(y))
        print('Data size =', X.shape)
        print('# clusters =', k)
        
        for i_runs in range(n_runs):
            u, w, centers, cost, svals, BIC, selected_idx = select_s(
                X, m=m, num_s=n_s, n_clusters=k, max_iter=max_iter, n_init=n_init, tol=tol, n_jobs=10
            )
    
            #print('svals', svals)
            #print('selected s:', svals[selected_idx])
            #print('w:', w[selected_idx])
            #print('cost:', cost[selected_idx])
            #print('BIC:', BIC)
            #print(BIC.argmax())
            from sklearn.metrics import adjusted_rand_score as ARI
            ari = ARI(y, u[selected_idx].argmax(axis=0))
            print('ARI =', ari)

        #break
        
Example #26
def evaluate(net, loader, writer, epoch):
    """evaluates on provided data
    """

    net.eval()
    predicts = np.zeros(len(loader.dataset), dtype=np.int32)
    labels = np.zeros(len(loader.dataset), dtype=np.int32)
    intermediates = np.zeros((len(loader.dataset), 2048), dtype=np.float32)
    images = np.zeros((len(loader.dataset), 3, 64, 64), dtype=np.float32)

    print(f"Evaluating on {len(loader.dataset)} samples")

    with torch.no_grad():
        for batch_idx, (batch, targets) in enumerate(loader):
            # logger.progress('processing %d/%d batch' % (batch_idx, len(loader)))
            batch = batch.to(cfg.device, non_blocking=True)
            # assuming the last head is the main one
            # output dimension of the last head
            # should be consistent with the ground-truth
            logits = net(batch, -1)
            start = batch_idx * loader.batch_size
            end = start + loader.batch_size
            end = min(end, len(loader.dataset))
            labels[start:end] = targets.cpu().numpy()
            predicts[start:end] = logits.max(1)[1].cpu().numpy()

            if epoch % cfg.embedding_freq == 0:
                intermediates[start:end] = net(batch, -1, True).cpu().numpy()
                if not cfg.tfm_adaptive_thresholding:
                    for i in range(3):
                        batch[:, i] = (batch[:, i] *
                                       cfg.tfm_stds[i]) + cfg.tfm_means[i]
                images[start:end] = torch.nn.functional.interpolate(
                    batch, size=(64, 64), mode='bicubic',
                    align_corners=False).cpu().numpy()

    # TODO: Gather labels and predicts
    # compute accuracy
    num_classes = labels.max().item() + 1
    count_matrix = np.zeros((num_classes, num_classes), dtype=np.int32)
    for i in range(predicts.shape[0]):
        count_matrix[predicts[i], labels[i]] += 1
    reassignment = np.dstack(
        linear_sum_assignment(count_matrix.max() - count_matrix))[0]
    acc = count_matrix[reassignment[:, 0], reassignment[:, 1]].sum().astype(
        np.float32) / predicts.shape[0]
    nmi = NMI(labels, predicts)
    ari = ARI(labels, predicts)

    # compute f1 scores per class
    predicts_reassigned = reassignment[predicts, 1]
    precision = precision_score(labels,
                                predicts_reassigned,
                                average=None,
                                zero_division=0)
    recall = recall_score(labels,
                          predicts_reassigned,
                          average=None,
                          zero_division=0)
    f1 = f1_score(labels, predicts_reassigned, average=None, zero_division=0)

    logger.info('Evaluation results at epoch %d are: '
                'ACC: %.3f, NMI: %.3f, ARI: %.3f' % (epoch, acc, nmi, ari))
    if cfg.local_rank == 0:
        writer.add_scalar('Evaluate/ACC', acc, epoch)
        writer.add_scalar('Evaluate/NMI', nmi, epoch)
        writer.add_scalar('Evaluate/ARI', ari, epoch)

        for i in range(len(f1)):
            writer.add_scalar(f'Evaluate/f1_{i}', f1[i], epoch)
            writer.add_scalar(f'Evaluate/precision_{i}', precision[i], epoch)
            writer.add_scalar(f'Evaluate/recall_{i}', recall[i], epoch)

        if epoch % cfg.embedding_freq == 0 and cfg.embedding_freq != -1:
            writer.add_embedding(intermediates, labels, images, epoch,
                                 cfg.session)

    return acc
Example #27
File: run_spectral.py Project: mlzxy/dac
args, _ = parser.parse_known_args()
print(str(args))

benchmark = torch.load(os.path.join(benchmarks_path, args.benchmarkfile))
accm = Accumulator('ari', 'nmi', 'et')
for batch in tqdm(benchmark):
    B = batch['X'].shape[0]
    for b in range(B):
        X = to_numpy(batch['X'][b])
        true_labels = to_numpy(batch['labels'][b].argmax(-1))
        true_K = len(np.unique(true_labels))

        tick = time.time()
        spec = SpectralClustering(n_clusters=true_K,
                                  affinity='nearest_neighbors',
                                  n_neighbors=10).fit(X)
        labels = spec.labels_

        accm.update([
            ARI(true_labels, labels),
            NMI(true_labels, labels, average_method='arithmetic'),
            time.time() - tick
        ])

save_dir = os.path.join(results_path, 'baselines', 'mmaf_spectral')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
logger = get_logger('spectral_baseline', os.path.join(save_dir, args.filename))
logger.info(accm.info())
Example #28
        plt.yticks(())
        plt.text(.99,
                 .01, ('Comp time '
                       '%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes,
                 size=10,
                 horizontalalignment='right')
        plt.text(.99,
                 .06, ('Purity score '
                       '%.3f' % (homogeneity_score(y, y_pred))).lstrip('0'),
                 transform=plt.gca().transAxes,
                 size=10,
                 horizontalalignment='right')
        plt.text(.99,
                 .11, ('Rand Index '
                       '%.3f' % (ARI(y, y_pred))).lstrip('0'),
                 transform=plt.gca().transAxes,
                 size=10,
                 horizontalalignment='right')
        plt.text(
            .99,
            .16,
            ('Silhouette score '
             '%.3f' %
             (silhouette(X, y_pred, metric='euclidean',
                         sample_size=n_samples))).lstrip('0'),
            transform=plt.gca().transAxes,
            size=10,
            horizontalalignment='right')
        plot1_num += 1
Example #29
def clustering(dataFile, outFile, config):
    """Core function of CHIMERA, performs:
        1) read and preprocess data
        2) clustering
        3) save results
    """
    #================================= Reading Data ======================================================
    sys.stdout.write('\treading data...\n')
    feat_cov = None
    feat_set = None
    ID = None
    with open(dataFile) as f:
        data = list(csv.reader(f))
        header = np.asarray(data[0])
        if 'Group' not in header:
            sys.stdout.write(
                'Error: group information not found. Please check csv header line for field "Group".\n'
            )
            sys.exit(1)
        if 'IMG' not in header:
            sys.stdout.write(
                'Error: image features not found. Please check csv header line for field "IMG".\n'
            )
            sys.exit(1)
        data = np.asarray(data[1:])

        group = (data[:, np.nonzero(header == 'Group')[0]].flatten()).astype(
            np.int8)
        feat_img = (data[:, np.nonzero(header == 'IMG')[0]]).astype(float)
        if 'COVAR' in header:
            feat_cov = (data[:, np.nonzero(header == 'COVAR')[0]]).astype(
                float)
        if 'ID' in header:
            ID = data[:, np.nonzero(header == 'ID')[0]]
            ID = ID[group == 1]
        if 'Set' in header:
            feat_set = data[:, np.nonzero(header == 'Set')[0]].flatten()

    #================================= Normalizing Data ======================================================
    if config['norm'] != 0:
        model, feat_img, feat_cov = data_normalization(feat_img, feat_cov,
                                                       config)

    #================================= Prepare Dataset ID ======================================================
    if feat_set is None:
        config['rs'] = 0
    else:
        unique_ID = np.unique(feat_set)
        datasetID = np.copy(feat_set)
        feat_set = np.zeros((len(datasetID), len(unique_ID)))
        for i in range(len(unique_ID)):
            feat_set[np.nonzero(datasetID == unique_ID[i])[0], i] = 1

    #================================= Calculate auto weight ==================================================
    if feat_cov is None:
        config['r'] = 0
    else:
        if config['r'] == -1.0:
            config['r'] = np.sum(np.var(feat_cov, axis=0)) / np.sum(
                np.var(feat_img, axis=0))

    #================================= Verbose information ==================================================
    if config['verbose']:
        sys.stdout.write(
            '\t\t================= data summary ==================\n')
        sys.stdout.write('\t\tnumber of patients: %d\n' % sum(group == 1))
        sys.stdout.write('\t\tnumber of normal controls: %d\n' %
                         sum(group == 0))
        sys.stdout.write('\t\timaging feature dimension: %d\n' %
                         feat_img.shape[1])
        if feat_cov is not None:
            sys.stdout.write('\t\tcovariates dimension: %d\n' %
                             feat_cov.shape[1])
        if feat_set is not None:
            sys.stdout.write('\t\tunique data set id: %d\n' % len(unique_ID))
        sys.stdout.write(
            '\t\t================ configurations =================\n')
        sys.stdout.write('\t\tnumber of clusters: %d\n' % config['K'])
        sys.stdout.write('\t\tnumber of runs: %d\n' % config['numRun'])
        sys.stdout.write('\t\tmax number of iterations: %d\n' %
                         config['max_iter'])
        sys.stdout.write('\t\tdistance ratio covar/img = %.4f\n' % config['r'])
        sys.stdout.write('\t\tdistance ratio set/img = %.4f\n' % config['rs'])
        sys.stdout.write('\t\tlambda1 = %.2f\tlambda2 = %.2f\n' %
                         (config['lambda1'], config['lambda2']))
        sys.stdout.write('\t\ttransformation chosen: %s\n' %
                         config['transform'])
        sys.stdout.write(
            '\t\t=================================================\n')

    #============================ Preparing Data ======================================================
    # separate data into patient and normal groups
    feat_img = np.transpose(feat_img)
    x = feat_img[:, group == 0]  # normal controls
    y = feat_img[:, group == 1]  # patients
    xd = []
    yd = []
    xs = []
    ys = []
    if feat_cov is not None:
        feat_cov = np.transpose(feat_cov)
        xd = feat_cov[:, group == 0]
        yd = feat_cov[:, group == 1]
    if feat_set is not None:
        feat_set = np.transpose(feat_set)
        xs = feat_set[:, group == 0]
        ys = feat_set[:, group == 1]

    #================================Perform Clustering (2 modes available)=================================
    sys.stdout.write('\tclustering...\n')
    if config['mode'] == 2:  #save result yields minimal energy
        obj = float('inf')
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            cur_obj = cur_result[2].min()
            if config['verbose']:
                sys.stdout.write('\t\tRun id %d, obj = %f\n' % (i, cur_obj))
            else:
                time_bar(i, config['numRun'])
            if cur_obj < obj:
                result = cur_result
                obj = cur_obj
        sys.stdout.write('\n')
        membership = np.dot(result[1], Tr(result[0]['delta']))
        label = np.argmax(membership, axis=1)
    else:  # save result most reproducible
        label_mat = []
        results = []
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            membership = np.dot(cur_result[1], Tr(cur_result[0]['delta']))
            label = np.argmax(membership, axis=1)
            label_mat.append(label)
            results.append(cur_result)
            time_bar(i, config['numRun'])
        sys.stdout.write('\n')
        label_mat = np.asarray(label_mat)
        ari_mat = np.zeros((config['numRun'], config['numRun']))
        for i in range(config['numRun']):
            for j in range(i + 1, config['numRun']):
                ari_mat[i, j] = ARI(label_mat[i, :], label_mat[j, :])
                ari_mat[j, i] = ari_mat[i, j]
        ave_ari = np.sum(ari_mat, axis=0) / (config['numRun'] - 1)
        idx = np.argmax(ave_ari)
        if config['verbose']:
            sys.stdout.write('\t\tBest average ARI is %f\n' % (max(ave_ari)))
        label = label_mat[idx, :]
        result = results[idx]

    #================================ Finalizing and Save =====================================
    sys.stdout.write('\tsaving results...\n')
    with open(outFile, 'w') as f:
        if ID is None:
            f.write('Cluster\n')
            for i in range(len(label)):
                f.write('%d\n' % (label[i] + 1))
        else:
            f.write('ID,Cluster\n')
            for i in range(len(label)):
                f.write('%s,%d\n' % (ID[i][0], label[i] + 1))
    if config['modelFile'] != "":
        trainData = {'x': x, 'xd': xd, 'xs': xs, 'datasetID': unique_ID}
        model.update({'trainData': trainData})
        model.update({'model': result})
        model.update({'config': config})
        with open(config['modelFile'], 'wb') as f:
            cPickle.dump(model, f, 2)
Example #30
def clustering(Xsvd,
               cells,
               dataset,
               suffix,
               labels=None,
               tlabels=None,
               method='knn',
               istsne=True,
               name='',
               batch_labels=None,
               seed=42):
    tsne = TSNE(n_jobs=24).fit_transform(Xsvd)

    for n_components in [15]:
        if method == 'gmm':
            clf = mixture.GaussianMixture(n_components=n_components).fit(tsne)
            labels_pred = clf.predict(tsne)
        elif method == 'knn':
            labels_pred = KMeans(n_components,
                                 n_init=200).fit_predict(tsne)  # n_jobs>1 ?
        elif method == 'dbscan':
            labels_pred = DBSCAN(eps=0.3, min_samples=10).fit(tsne).labels_
        elif method == 'spectral':
            spectral = cluster.SpectralClustering(n_clusters=n_components,
                                                  eigen_solver='arpack',
                                                  affinity="nearest_neighbors")
            labels_pred = spectral.fit_predict(tsne)
        elif method == 'louvain':
            from scipy.spatial import distance

            for louvain in [30]:
                print('****', louvain)
                mat = kneighbors_graph(Xsvd,
                                       louvain,
                                       mode='distance',
                                       include_self=True).todense()

                G = nx.from_numpy_matrix(mat)
                partition = community.best_partition(G, random_state=seed)

                labels_pred = []
                for i in range(mat.shape[0]):
                    labels_pred.append(partition[i])

                labels_pred = np.array(labels_pred)
                print('louvain', louvain, tsne[:5], len(labels),
                      len(labels_pred))
                #print(np.unique(labels_pred))

                if labels is not None:
                    nmi_score = NMI(labels, labels_pred)
                    ari_score = ARI(labels, labels_pred)
                    print(
                        n_components, method,
                        "Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" %
                        (nmi_score, ari_score))

    if istsne:
        n_components = len(np.unique(labels_pred))
        vis_x = tsne[:, 0]
        vis_y = tsne[:, 1]
        colors = [
            'blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink',
            'yellow', 'black', 'teal', 'plum', 'tan', 'bisque', 'beige',
            'slategray', 'brown', 'darkred', 'salmon', 'coral', 'olive',
            'lightpink', 'teal', 'darkcyan', 'BlueViolet', 'CornflowerBlue',
            'DarkKhaki', 'DarkTurquoise'
        ]

        show_tsne(tsne,
                  labels,
                  'result/%s/%s-%s-LSI-true.png' % (dataset, name, suffix),
                  tlabels=tlabels)
        show_tsne(tsne, labels_pred,
                  'result/%s/%s-%s-LSI-pred.png' % (dataset, name, suffix))

        with open('result/%s-LSI-cluster_result.csv' % (dataset), 'w') as f:
            f.write('cell,predicted label,tsne-1,tsne-2\n')
            for cell, pred, t in zip(cells, labels_pred, tsne):
                f.write('%s,%d,%f,%f\n' % (cell, pred, t[0], t[1]))

    if batch_labels is not None:
        show_tsne(
            tsne, batch_labels, 'result/%s/%s-GMVAE-%s-%s-batch.png' %
            (dataset, dataset, suffix, name))