Example #1
def test_kmeans():
    """Test implementation of Kmeans."""
    X_train, y_train = load_basic_motions(split="train")
    X_test, y_test = load_basic_motions(split="test")

    kmeans = TimeSeriesKMeans(
        averaging_method="mean",
        random_state=1,
        n_init=2,
        n_clusters=4,
        init_algorithm="kmeans++",
        metric="dtw",
    )
    train_predict = kmeans.fit_predict(X_train)
    train_mean_score = metrics.rand_score(y_train, train_predict)

    test_mean_result = kmeans.predict(X_test)
    mean_score = metrics.rand_score(y_test, test_mean_result)
    proba = kmeans.predict_proba(X_test)

    assert np.array_equal(test_mean_result, expected_results["mean"])
    assert mean_score == expected_score["mean"]
    assert train_mean_score == expected_train_result["mean"]
    assert kmeans.n_iter_ == expected_iters["mean"]
    assert np.array_equal(kmeans.labels_, expected_labels["mean"])
    assert isinstance(kmeans.cluster_centers_, np.ndarray)
    assert proba.shape == (40, 4)

    for val in proba:
        assert np.count_nonzero(val == 1.0) == 1
Example #2
def test_kmedoids():
    """Test implementation of Kmedoids."""
    X_train, y_train = load_basic_motions(split="train")
    X_test, y_test = load_basic_motions(split="test")

    kmedoids = TimeSeriesKMedoids(
        random_state=1,
        n_init=2,
        max_iter=5,
        init_algorithm="kmeans++",
        metric="euclidean",
    )
    train_predict = kmedoids.fit_predict(X_train)
    train_score = metrics.rand_score(y_train, train_predict)
    test_medoids_result = kmedoids.predict(X_test)
    medoids_score = metrics.rand_score(y_test, test_medoids_result)
    proba = kmedoids.predict_proba(X_test)
    assert np.array_equal(test_medoids_result, expected_results["medoids"])
    assert medoids_score == expected_score["medoids"]
    assert train_score == train_expected_score["medoids"]
    assert np.isclose(kmedoids.inertia_, expected_inertia["medoids"])
    assert kmedoids.n_iter_ == expected_iters["medoids"]
    assert np.array_equal(kmedoids.labels_, expected_labels["medoids"])
    assert isinstance(kmedoids.cluster_centers_, np.ndarray)
    assert proba.shape == (40, 8)

    for val in proba:
        assert np.count_nonzero(val == 1.0) == 1
Example #3
def plotnCluster(x, y):
    accuracy = []
    times = []
    for n in range(1, 11):
        start = datetime.datetime.now()
        y_predict = GaussianMixture(n_components=n).fit_predict(x)
        y_test_accuracy = metrics.rand_score(y, y_predict)
        stop = datetime.datetime.now()
        accuracy.append(y_test_accuracy * 100)
        times.append((stop - start).total_seconds())

    _, axes = plt.subplots(1, 2, figsize=(20, 5))
    axes[0].grid()
    param_range = np.linspace(1, 10, len(accuracy))
    axes[0].plot(param_range, accuracy, label="em", color="blue", lw=2)
    axes[0].legend(loc="best")
    axes[0].set_xlabel("number of clusters")
    axes[0].set_ylabel("Accuracy score %")
    axes[1].grid()
    param_range = np.linspace(1, 10, len(times))
    axes[1].plot(param_range, times, label="em", color="blue", lw=2)
    axes[1].legend(loc="best")
    axes[1].set_xlabel("number of clusters")
    axes[1].set_ylabel("Time in Seconds")
    plt.suptitle('Cancer')
    plt.show()
Example #4
def eval_clustering(X, labels, true_labels=None):
    sil_score = silhouette_score(X, labels, metric='euclidean')
    average_distance_within = avg_dist_within(X, labels)
    # Rand score needs ground truth; report -1 when none is supplied.
    rand_sc = rand_score(true_labels, labels) if true_labels is not None and len(true_labels) else -1

    return [sil_score, average_distance_within, rand_sc]
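The helper avg_dist_within is not defined in this example; a minimal sketch of what such a helper could compute (mean pairwise intra-cluster Euclidean distance), assuming NumPy-compatible inputs:

import numpy as np
from scipy.spatial.distance import pdist

def avg_dist_within(X, labels):
    # Hypothetical helper, not from the original source: for each cluster
    # with at least two members, take the mean pairwise Euclidean distance,
    # then average across clusters.
    X = np.asarray(X)
    labels = np.asarray(labels)
    per_cluster = []
    for lab in np.unique(labels):
        members = X[labels == lab]
        if len(members) > 1:
            per_cluster.append(pdist(members, metric='euclidean').mean())
    return float(np.mean(per_cluster)) if per_cluster else 0.0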
Example #5
def print_score(*args):
    ri = metrics.rand_score(*args)
    ami = metrics.adjusted_mutual_info_score(*args)
    h = metrics.homogeneity_score(*args)
    c = metrics.completeness_score(*args)
    v = metrics.v_measure_score(*args)
    print([ri, ami, h, c, v])
Example #6
def kmeanProcessed():
    transforms = [("ica", ica), ("nmf", nmf), ("pca", pca), ("randDe", randDe)]
    for name, z in transforms:
        print('this is the ' + name)
        x, y = z.transformData()
        y_predict = KMeans(n_clusters=2).fit_predict(x)
        score = metrics.rand_score(y, y_predict)
        print(name + ' kmean rand score')
        print(score)
        print(name + ' done')
Example #7
def emProcessed():
    transforms = [("ica", ica), ("nmf", nmf), ("pca", pca), ("randDe", randDe)]
    for name, z in transforms:
        print('this is the ' + name)
        x, y = z.transformData()
        y_predict = GaussianMixture(n_components=2).fit_predict(x)
        score = metrics.rand_score(y, y_predict)
        print(name + ' em rand score')
        print(score)
        print(name + ' done')
Example #8
def test_kmeans_dba():
    """Test implementation of Kmeans using dba."""
    X_train, y_train = load_basic_motions(split="train")
    X_test, y_test = load_basic_motions(split="test")

    num_test_values = 5

    kmeans = TimeSeriesKMeans(
        averaging_method="dba",
        random_state=1,
        n_init=2,
        n_clusters=4,
        init_algorithm="kmeans++",
        metric="dtw",
        distance_params={"window": 0.2},
        average_params={"window": 0.2},
    )
    train_predict = kmeans.fit_predict(X_train.head(num_test_values))
    train_mean_score = metrics.rand_score(y_train[0:num_test_values],
                                          train_predict)

    test_mean_result = kmeans.predict(X_test.head(num_test_values))
    mean_score = metrics.rand_score(y_test[0:num_test_values],
                                    test_mean_result)
    proba = kmeans.predict_proba(X_test.head(num_test_values))

    assert np.array_equal(test_mean_result, expected_results["dba"])
    assert mean_score == expected_score["dba"]
    assert train_mean_score == expected_train_result["dba"]
    assert kmeans.n_iter_ == expected_iters["dba"]
    assert np.array_equal(kmeans.labels_, expected_labels["dba"])
    assert isinstance(kmeans.cluster_centers_, np.ndarray)
    assert proba.shape == (5, 4)

    for val in proba:
        assert np.count_nonzero(val == 1.0) == 1
Example #9
def main():
    ep = .8
    min_pts = 3
    path = "data/dataset1.txt"

    print(len(sys.argv))
    if len(sys.argv) < 4:
        print("Usage : python dbscan.py <eps> <minPts> <input file path>")
        print("running for default values")
    if len(sys.argv) > 1 and sys.argv[1]:
        ep = float(sys.argv[1])

    if len(sys.argv) > 2 and sys.argv[2]:
        min_pts = int(sys.argv[2])

    if len(sys.argv) > 3 and sys.argv[3]:
        path = str(sys.argv[3])

    # the iyer dataset carries one extra leading column before the features
    si = 2
    if "iyer" in path:
        si = 3
    X = np.loadtxt(path)[:, si:]
    trueLabels = np.loadtxt(path)[:, 1]

    dbscan = DBScan()
    labels = dbscan.fit(X, ep, min_pts)
    # np.savetxt("dbscanLabels.txt", labels.astype(int), fmt='%d')
    # np.savetxt("dbscanout.txt", labels)

    plotPCA(X, path.split("/")[1].split(".")[0] + "_predicted_clusters_min:" + str(min_pts) + "_eps:" + str(ep), True, labels)

    import metrics  # code in metrics.py
    print("jaccard coeff")
    jac_metric = metrics.calculateJaccardCoeff(trueLabels, labels)
    print(jac_metric)

    print("correlation")
    cor = metrics.computeCorrelation(X, labels)
    print(cor)

    from sklearn.metrics import adjusted_rand_score as rand_score  # library function
    print("adjusted rand score")
    rand_met = rand_score(trueLabels.T, labels.T)
    print(rand_met)

    # results.append([jac_metric, rand_met, cor, ep, min_pts])
    # print(np.corrcoef(X, labels))

    dbscan.plotKnn(X, min_pts)
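metrics.calculateJaccardCoeff above comes from the project's own metrics.py, which is not shown. A plausible pair-counting sketch of the Jaccard coefficient between two labelings (an assumption, not the project's actual code):

import numpy as np

def calculate_jaccard_coeff(true_labels, pred_labels):
    # Hypothetical sketch: over all sample pairs, Jaccard = pairs co-clustered
    # in both labelings / pairs co-clustered in at least one labeling.
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    same_true = true_labels[:, None] == true_labels[None, :]
    same_pred = pred_labels[:, None] == pred_labels[None, :]
    off_diag = ~np.eye(len(true_labels), dtype=bool)  # ignore self-pairs
    both = np.sum(same_true & same_pred & off_diag)
    either = np.sum((same_true | same_pred) & off_diag)
    return both / either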
Example #10
def debug_clusterers():
    """Debug clusterers."""
    X_train, y_train = load_basic_motions(split="train", return_type="numpy3d")
    #    X_train, y_train = load_unit_test(split="train", return_type="numpy3d")
    #   X_train2, y_train2 = load_unit_test(split="train", return_type="numpy2d")
    parameters = {"window": 1.0, "epsilon": 50.0, "g": 0.05, "c": 1.0}
    for dist in distances:
        kmeans = TimeSeriesKMeans(
            averaging_method="mean",
            random_state=1,
            n_init=2,
            n_clusters=2,
            init_algorithm="kmeans++",
            metric=dist,
            distance_params=parameters,
        )
        kmeans.fit(X_train)
        y_pred = kmeans.predict(X_train)
        train_rand = metrics.rand_score(y_train, y_pred)
        print('"' + dist + '": ' + str(train_rand) + ",")
Example #11
def plotnCluster():
    transforms = [("ica", ica), ("nmf", nmf), ("pca", pca), ("randDe", randDe)]
    _, axes = plt.subplots(1, 2, figsize=(20, 5))

    for name, z in transforms:
        accuracy = []
        times = []
        print('this is the ' + name)
        x, y = z.transformData()
        for n in range(1, 11):
            start = datetime.datetime.now()
            y_predict = GaussianMixture(n_components=n).fit_predict(x)
            y_test_accuracy = metrics.rand_score(y, y_predict)
            stop = datetime.datetime.now()
            accuracy.append(y_test_accuracy * 100)
            times.append((stop - start).total_seconds())
        param_range_a = np.linspace(1, 10, len(accuracy))
        param_range_t = np.linspace(1, 10, len(times))
        axes[0].plot(param_range_a, accuracy, label=name, lw=2)
        axes[1].plot(param_range_t, times, label=name, lw=2)

    axes[0].grid()
    axes[0].legend(loc="best")
    axes[0].set_xlabel("number of clusters")
    axes[0].set_ylabel("Accuracy score %")
    axes[1].grid()
    axes[1].legend(loc="best")
    axes[1].set_xlabel("number of clusters")
    axes[1].set_ylabel("Time in Seconds")
    plt.suptitle('EM Cancer')
    plt.show()
Example #12
def evaluateTfIdf(labels_true, labels_pred):
    print("withTfIdf")
    print("metrics.rand_score", metrics.rand_score(labels_true, labels_pred))
    print("metrics.homogeneity_score", metrics.homogeneity_score(labels_true, labels_pred))
    print("metrics.adjusted_mutual_info_score", metrics.adjusted_mutual_info_score(labels_true, labels_pred))
Example #13
def evaluateWordToVec(labels_true, labels_pred):
    print("withWordToVec")
    print("metrics.rand_score", metrics.rand_score(labels_true, labels_pred))
    print("metrics.homogeneity_score", metrics.homogeneity_score(labels_true, labels_pred))
    print("metrics.adjusted_mutual_info_score", metrics.adjusted_mutual_info_score(labels_true, labels_pred))
Example #14
def save_murcko_result(murcko_clusters):
    df_dict = {}
    for index, key in enumerate(murcko_clusters.keys()):
        if len(murcko_clusters[key]) > 400:
            for sm in murcko_clusters[key]:
                df_dict[sm] = index
    return df_dict


if __name__ == "__main__":

    # read the compounds from murcko_labels.csv
    murcko = pd.read_csv("../data/labels/murcko_labels.csv")

    # get descriptor of each compound in murcko_labels.csv from ../data
    desc_df = read_csvs("../data")
    desc_dict = dict(zip(desc_df.smiles, desc_df.descriptors))
    chemprop_desc = {}
    for index, sm in enumerate(murcko.smiles):
        if not index % 10000:
            print(index)
        desc = desc_dict[sm]
        chemprop_desc[sm] = desc

    # cluster these compounds (MiniBatchKMeans was tried, AffinityPropagation is used)
    print("starting clustering")
    #kmeans = MiniBatchKMeans(n_clusters=21, verbose=1).fit(list(chemprop_desc.values()))
    ap = AffinityPropagation(verbose=True).fit(list(chemprop_desc.values()))
    # evaluate the clustering by external indices
    print("starting rand index computations")
    rand = rand_score(list(murcko.label), ap.labels_)
Example #15
import os
# Change the current working directory
os.chdir(
    "/Users/davidlin/Desktop/School/Master/2021_secondSem/SC/image-segmentation/"
)
from Code.lib import generators
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
import pandas as pd
from hmmlearn import hmm
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.datasets import make_circles
from sklearn import metrics
# ===============================================================================================
### Scenario 1: Simulate three clusters, each of them is 2-d independent bivariate normal distribution
# number of clusters
num_clusters = 3
# sample size
n = 1000
# Specify three mean vectors for these clusters
mu1 = np.array([1, 2]).reshape(2, 1)
mu2 = np.array([18, 5]).reshape(2, 1)
mu3 = np.array([5, 15]).reshape(2, 1)
mu = [mu1, mu2, mu3]
# Specify three covariance matrix for these clusters
Sigma1 = np.array([[2, 0], [0, 2]])
Sigma2 = np.array([[2, 0], [0, 1]])
Sigma3 = np.array([[5, 0], [0, 4]])
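The excerpt cuts off here; the sampling step these parameters feed into would look something like this sketch (an assumption, using NumPy's multivariate normal generator):

# Hypothetical continuation: draw n points from each cluster and stack them.
rng = np.random.default_rng(0)
X = np.vstack([
    rng.multivariate_normal(m.ravel(), S, size=n)
    for m, S in zip(mu, [Sigma1, Sigma2, Sigma3])
])
true_labels = np.repeat(np.arange(num_clusters), n)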
Example #16
File: figure6.py  Project: meyer-lab/tHMM
def accuracy():
    """ Sweep a distribution parameter across populations and score state-assignment accuracy. """
    # Creating a list of populations to analyze over
    list_of_Es = [[
        StateDistribution(E2[1].params[0], E2[1].params[1], E2[1].params[2], a,
                          E2[1].params[4], E2[1].params[5]), E2[1]
    ] for a in np.linspace(4.0, 20.0, num_data_points)]
    list_of_populations = [[
        LineageTree.init_from_parameters(pi, T, E, max_desired_num_cells)
    ] for E in list_of_Es]
    # for the violin plots
    list_of_Es2 = [[
        StateDistribution(E2[1].params[0], E2[1].params[1], E2[1].params[2], a,
                          E2[1].params[4], E2[1].params[5]), E2[1]
    ] for a in np.linspace(4.0, 20.0, num_data_points)]
    list_of_populations2 = [[
        LineageTree.init_from_parameters(pi, T, E, 3 * max_desired_num_cells)
    ] for E in list_of_Es2]

    balanced_score = np.empty(len(list_of_populations))

    for ii, pop in enumerate(list_of_populations):
        ravel_true_states = np.array(
            [cell.state for lineage in pop for cell in lineage.output_lineage])
        all_cells = np.array(
            [cell.obs for lineage in pop for cell in lineage.output_lineage])

        kmeans = KMeans(n_clusters=2).fit(all_cells).labels_
        balanced_score[ii] = 100 * rand_score(ravel_true_states, kmeans)

    # replace x with 1-x if the accuracy is less than 50%
    balanced_score[
        balanced_score < 50.0] = 100.0 - balanced_score[balanced_score < 50.0]

    wass, _, dict_out, _ = commonAnalyze(list_of_populations,
                                         2,
                                         xtype="wass",
                                         list_of_fpi=[pi] * num_data_points,
                                         list_of_fT=[T] * num_data_points,
                                         parallel=True)
    accuracy = dict_out["state_similarity"]
    distribution_df = pd.DataFrame(
        columns=["Distribution Similarity", "G1 lifetime", "State"])
    lineages = [
        list_of_populations2[int(num_data_points * i / 4.)][0]
        for i in range(4)
    ]
    len_lineages = [len(lineage) for lineage in lineages]
    distribution_df["G1 lifetime"] = [(cell.obs[1] + cell.obs[2])
                                      for lineage in lineages
                                      for cell in lineage.output_lineage]
    distribution_df["State"] = [
        "State 1" if cell.state == 0 else "State 2" for lineage in lineages
        for cell in lineage.output_lineage
    ]
    distribution_df["Distribution Similarity"] = len_lineages[0] * ["Same\n" + str(0) + "-" + str(wass[-1] / 4)] +\
        len_lineages[1] * ["Similar\n" + str(wass[-1] / 4) + "-" + str(wass[-1] / 2)] +\
        len_lineages[2] * ["Different\n" + str(wass[-1] / 2) + "-" + str(wass[-1] * 0.75)] +\
        len_lineages[3] * ["Distinct\n>" + str(wass[-1] * 0.75)]

    # for the violin plot (distributions)
    wasser_df = pd.DataFrame(
        columns=["Wasserstein distance", "Random Index Accuracy"])
    wasser_df["Wasserstein distance"] = wass
    wasser_df["Random Index Accuracy"] = accuracy
    wasser_df["KMeans Accuracy"] = balanced_score
    return distribution_df, wasser_df
Example #17
def em(x, y):
    y_predict = GaussianMixture(n_components=25).fit_predict(x)
    print(metrics.rand_score(y, y_predict))
Example #18
def report_clustering(distance_file,
                      biom_file,
                      metadata_file,
                      num_clusters,
                      verbose,
                      L=2,
                      output_file=None):
    if not isinstance(distance_file, list):
        distance_matrix = CSV.read(distance_file)
    else:
        distance_matrix = distance_file

    if output_file is not None:
        f = open(output_file, 'w')

    output_matrix = []

    AgglomerativeCluster = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed',
        linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(n_clusters=num_clusters,
                               metric='precomputed',
                               method='pam',
                               init='heuristic').fit_predict(distance_matrix)

    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)
    region_names = []
    for i in range(len(PCoA_Samples)):
        if metadata[PCoA_Samples[i]]['body_site'] not in region_names:
            region_names.append(metadata[PCoA_Samples[i]]['body_site'])
        PCoA_Samples[i] = region_names.index(
            metadata[PCoA_Samples[i]]['body_site'])

    if verbose and L == 1:
        print('Printing results for L1-UniFrac:')
    elif verbose and L == 2:
        print('Printing results for L2-UniFrac:')
    if verbose:
        print('Metric\t\t\t\t\t\t\tAgglomerativeClustering\t\tKMedoids')

    if output_file is not None:
        if L == 1:
            f.write('Printing results for L1-UniFrac:\n')
        elif L == 2:
            f.write('Printing results for L2-UniFrac:\n')
        f.write('Metric\t\t\t\tAgglomerativeClustering\t\t\tKMedoids\n')

    if L == 1:
        output_matrix.append(['Printing results for L1-UniFrac:'])
    if L == 2:
        output_matrix.append(['Printing results for L2-UniFrac:'])
    output_matrix.append(['Metric', 'AgglomerativeClustering', 'KMedoids'])

    RI1 = rand_score(PCoA_Samples, AgglomerativeCluster)
    RI2 = rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Rand Index Score:               {RI1}\t\t\t{RI2}')
    ARI1 = adjusted_rand_score(PCoA_Samples, AgglomerativeCluster)
    ARI2 = adjusted_rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Rand Index Score:      {ARI1}\t\t\t{ARI2}')
    NMI1 = normalized_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    NMI2 = normalized_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Normalized Mutual Index Score:  {NMI1}\t\t\t{NMI2}')
    AMI1 = adjusted_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    AMI2 = adjusted_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Mutual Info Score:     {AMI1}\t\t\t{AMI2}')
    FM1 = fowlkes_mallows_score(PCoA_Samples, AgglomerativeCluster)
    FM2 = fowlkes_mallows_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Fowlkes Mallows Score:          {FM1}\t\t\t{FM2}')

    if output_file is not None:
        f.write(f'Rand Index Score:               {RI1}\t\t\t{RI2}\n')
        f.write(f'Adjusted Rand Index Score:      {ARI1}\t\t\t{ARI2}\n')
        f.write(f'Normalized Mutual Index Score:  {NMI1}\t\t\t{NMI2}\n')
        f.write(f'Adjusted Mutual Info Score:     {AMI1}\t\t\t{AMI2}\n')
        f.write(f'Fowlkes Mallows Score:          {FM1}\t\t\t{FM2}\n')
        f.close()

    output_matrix.append(['Rand Index Score:', RI1, RI2])
    output_matrix.append(['Adjusted Rand Index Score:', ARI1, ARI2])
    output_matrix.append(['Normalized Mutual Index Score:', NMI1, NMI2])
    output_matrix.append(['Adjusted Mutual Info Score:', AMI1, AMI2])
    output_matrix.append(['Fowlkes Mallows Score:', FM1, FM2])

    return output_matrix
Example #19
def kmean(x, y):
    y_predict = KMeans(n_clusters=2).fit_predict(x)
    print(metrics.rand_score(y, y_predict))
Example #20
File: Analyze.py  Project: meyer-lab/tHMM
def Results(tHMMobj, pred_states_by_lineage: list,
            LL: float) -> dict[str, Any]:
    """
    This function calculates several results of fitting a synthetic lineage and stores them in a dictionary.
    The dictionary contains the total number of lineages, the log likelihood of state assignments, and
    the total number of cells. It also contains metrics such as the accuracy of state assignment predictions,
    the distance between two distributions, and the Wasserstein distance between two states.

    """
    # Instantiating a dictionary to hold the various metrics of accuracy and scoring for the results of our method
    results_dict: dict[str, Any]
    results_dict = {}
    # To find the switcher map for states based on log-likelihood
    permutes = list(itertools.permutations(np.arange(tHMMobj.num_states)))
    score_permutes = np.empty(len(permutes))

    pi_arg = tHMMobj.X[0].pi if (tHMMobj.fpi is None) else tHMMobj.fpi
    E_arg = tHMMobj.X[0].E if (tHMMobj.fE is None) else tHMMobj.fE
    T_arg = tHMMobj.X[0].T if (tHMMobj.fT is None) else tHMMobj.fT

    pred_states = tHMMobj.predict()
    for i, perm in enumerate(permutes):
        predState_permute = [[perm[st] for st in st_assgn]
                             for st_assgn in pred_states]
        score_permutes[i] = np.sum(
            tHMMobj.log_score(predState_permute, pi=pi_arg, T=T_arg, E=E_arg))

    # Create switcher map based on the max likelihood of different permutations of state assignments
    switch_map = np.array(permutes[np.argmax(score_permutes)])
    tHMMobj, pred_states = permute_states(tHMMobj, switch_map)
    results_dict["total_number_of_lineages"] = len(tHMMobj.X)
    results_dict["LL"] = LL
    results_dict["total_number_of_cells"] = sum(
        [len(lineage.output_lineage) for lineage in tHMMobj.X])

    true_states_by_lineage = [[cell.state for cell in lineage.output_lineage]
                              for lineage in tHMMobj.X]

    results_dict["transition_matrix_similarity"] = np.linalg.norm(
        tHMMobj.estimate.T - tHMMobj.X[0].T)

    results_dict["pi_similarity"] = np.linalg.norm(tHMMobj.X[0].pi -
                                                   tHMMobj.estimate.pi)

    # Get the estimated parameter values
    results_dict["param_estimates"] = [
        tHMMobj.estimate.E[x].params for x in range(tHMMobj.num_states)
    ]

    # Get the true parameter values
    results_dict["param_trues"] = [
        tHMMobj.X[0].E[x].params for x in range(tHMMobj.num_states)
    ]

    # Get the distance between distributions of two states
    results_dict["distribution distance 0"] = tHMMobj.estimate.E[0].dist(
        tHMMobj.X[0].E[0])
    results_dict["distribution distance 1"] = tHMMobj.estimate.E[1].dist(
        tHMMobj.X[0].E[1])

    # 2. Calculate accuracy after switching states
    results_dict["state_counter"] = np.bincount(pred_states[0])
    results_dict["state_proportions"] = [
        100.0 * i / len(pred_states[0]) for i in results_dict["state_counter"]
    ]
    results_dict["state_proportions_0"] = results_dict["state_proportions"][0]
    results_dict["state_similarity"] = 100.0 * rand_score(
        list(itertools.chain(*true_states_by_lineage)),
        list(itertools.chain(*tHMMobj.predict())))

    # 4. Calculate the Wasserstein distance
    results_dict["wasserstein"] = tHMMobj.X[0].E[0].dist(tHMMobj.X[0].E[1])

    return results_dict
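The state_similarity line above flattens the per-lineage state lists before scoring them with rand_score; in isolation the pattern looks like this (toy stand-in values, not real lineages):

import itertools
from sklearn.metrics import rand_score

true_states_by_lineage = [[0, 0, 1], [1, 1, 0]]
pred_states_by_lineage = [[0, 0, 1], [1, 0, 0]]
similarity = 100.0 * rand_score(
    list(itertools.chain(*true_states_by_lineage)),
    list(itertools.chain(*pred_states_by_lineage)))
print(similarity)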
Example #21
                         average='macro',
                         zero_division='warn'))
print("Precision - k-Means Clustering:")
print(metrics.precision_score(y_test, y_predicted, average='macro'))
print("F1 - k-Means Clustering:")
print(
    metrics.f1_score(y_test,
                     y_predicted,
                     average='macro',
                     zero_division='warn',
                     labels=np.unique(y_predicted)))

#calculation of rand score
#Similarity measure between two clusterings by considering all pairs of samples and
#counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
rand = metrics.rand_score(y_test, y_predicted)
print(rand)

#The adjusted Rand index is the Rand index corrected for chance, so that random
#labelings score close to 0 regardless of the number of clusters.
adj_rand = metrics.adjusted_rand_score(y_test, y_predicted)
print(adj_rand)

#Adjusted Mutual Information (AMI) is an adjustment of the Mutual Information (MI) score to account for chance.
#It accounts for the fact that the MI is generally higher for two clusterings with a larger number of clusters,
#regardless of whether there is actually more information shared.
ami = metrics.adjusted_mutual_info_score(y_test, y_predicted)
print(ami)

print(type(y_predicted))
y_predicted = pd.DataFrame(y_predicted)
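The chance correction described in the comments above is easy to see on toy labels (a standalone sketch, not part of the original pipeline):

from sklearn import metrics

y_true = [0, 0, 0, 1, 1, 1]
y_rand = [0, 1, 0, 1, 0, 1]  # essentially arbitrary assignment
print(metrics.rand_score(y_true, y_rand))           # 7/15 ~ 0.47, looks moderate
print(metrics.adjusted_rand_score(y_true, y_rand))  # ~ -0.11, near zero after chance correction
print(metrics.adjusted_mutual_info_score(y_true, y_rand))  # also near zero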
Example #22
    other = dnames[did]
    
    # ok now we mix these 2 and calculate the rand
    #dat = [load.loadgruen_single( f"../data/punk/{dname}",subsample=700)  for dname in currentnames]
    #pp = pp_many.Data().fit(dat,debug_ftsel=False,scale=True, maxgenes = int(1500/len(currentnames))) # TODO: maxgenes for all parts.. together
    r=[]
    for i in range(5):
        dat = [load.loadgruen_single( f"../data/punk/{dname}",subsample=1000)  for dname in [start, other]]
        pp = preprocessing.Data().fit(*dat,
                    debug_ftsel=False,
                    scale=True, 
                    maxgenes = 800)  
        allteX =  np.vstack(pp.dx)
        labels = natto.process.gmm_1(allteX)
        real_labels = [i for d in [pp.a, pp.b] for i in d.obs['true'].values]
        rands = rand_score(real_labels, labels)
        r.append(rands)

    randscr.append(np.array(r).mean())
    print("RAND", randscr)

# then do this starting with any 
#  [0.8924611305652826, 0.9637320660330164, 0.9003336668334168, 0.7692594297148575, 0.7688283141570785, 0.7588649324662331]
# this is when we dont subsample for the samples to be even: 
# 0.8940090045022512, 0.9481543771885942, 0.8994602301150575, 0.7643338669334667, 0.76911995997999, 0.7652657328664332

# 2. optimization of clustering algorithms on real labels? 
    # the best algo might not yield the best curve on the noise plot
    # should i just plot 999 noise plots? .... 

# %%
Example #23
    mi_edges = np.ma.masked_all((fragments.shape[1], fragments.shape[1]))
    ami_edges = np.ma.masked_all((fragments.shape[1], fragments.shape[1]))
    nmi_edges = np.ma.masked_all((fragments.shape[1], fragments.shape[1]))

    for i, j in tqdm(zip(*np.tril_indices(fragments.shape[1], k=-1)),
                     desc='edges',
                     total=((fragments.shape[1]**2) / 2 - fragments.shape[1])):

        # check if two sites are connected by enough fragments
        connections = get_connections(fragments[:, i], fragments[:, j])

        if connections.sum() < min_connections:
            continue

        # rand index
        rand = rand_score(fragments[:, i][connections],
                          fragments[:, j][connections])
        rand_edges[i, j] = rand
        rand_edges[j, i] = rand

        adj_rand = adjusted_rand_score(fragments[:, i][connections],
                                       fragments[:, j][connections])
        adj_rand_edges[i, j] = adj_rand
        adj_rand_edges[j, i] = adj_rand

        # mutual info
        mi = mutual_info_score(fragments[:, i][connections],
                               fragments[:, j][connections])
        mi_edges[i, j] = mi
        mi_edges[j, i] = mi

        ami = adjusted_mutual_info_score(fragments[:, i][connections],
                                         fragments[:, j][connections])
        ami_edges[i, j] = ami
        ami_edges[j, i] = ami
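get_connections is not shown in this excerpt. Since the edge matrices are built with np.ma, a plausible sketch is a boolean mask of fragments that carry an observed value at both sites (an assumption, not the original implementation):

import numpy as np

def get_connections(site_a, site_b):
    # Hypothetical: a fragment "connects" two sites when it has an
    # unmasked (observed) call at both of them.
    return ~np.ma.getmaskarray(site_a) & ~np.ma.getmaskarray(site_b)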