def test_kmeans():
    """Check TimeSeriesKMeans with mean averaging against stored expectations.

    The model is fitted on the train split of the basic motions data and then
    used to predict the test split. Predictions, rand scores, iteration count
    and fitted labels are compared with the module-level ``expected_*``
    tables under the ``"mean"`` key.
    """
    train_X, train_y = load_basic_motions(split="train")
    test_X, test_y = load_basic_motions(split="test")

    model_kwargs = {
        "averaging_method": "mean",
        "random_state": 1,
        "n_init": 2,
        "n_clusters": 4,
        "init_algorithm": "kmeans++",
        "metric": "dtw",
    }
    clusterer = TimeSeriesKMeans(**model_kwargs)

    # Fit first; every later call relies on the fitted state.
    fitted_assignments = clusterer.fit_predict(train_X)
    train_rand = metrics.rand_score(train_y, fitted_assignments)

    test_assignments = clusterer.predict(test_X)
    test_rand = metrics.rand_score(test_y, test_assignments)

    probabilities = clusterer.predict_proba(test_X)

    assert np.array_equal(test_assignments, expected_results["mean"])
    assert test_rand == expected_score["mean"]
    assert train_rand == expected_train_result["mean"]
    assert clusterer.n_iter_ == expected_iters["mean"]
    assert np.array_equal(clusterer.labels_, expected_labels["mean"])
    assert isinstance(clusterer.cluster_centers_, np.ndarray)

    assert probabilities.shape == (40, 4)
    # Each row must contain exactly one entry equal to 1.0.
    for row in probabilities:
        hits = np.count_nonzero(row == 1.0)
        assert hits == 1
def test_kmedoids():
    """Check TimeSeriesKMedoids against stored expectations.

    The model is fitted on the train split of the basic motions data and then
    used to predict the test split. Predictions, rand scores, inertia,
    iteration count and fitted labels are compared with the module-level
    expectation tables under the ``"medoids"`` key.
    """
    train_X, train_y = load_basic_motions(split="train")
    test_X, test_y = load_basic_motions(split="test")

    model_kwargs = {
        "random_state": 1,
        "n_init": 2,
        "max_iter": 5,
        "init_algorithm": "kmeans++",
        "metric": "euclidean",
    }
    clusterer = TimeSeriesKMedoids(**model_kwargs)

    # Fit first; every later call relies on the fitted state.
    fitted_assignments = clusterer.fit_predict(train_X)
    train_rand = metrics.rand_score(train_y, fitted_assignments)

    test_assignments = clusterer.predict(test_X)
    test_rand = metrics.rand_score(test_y, test_assignments)

    probabilities = clusterer.predict_proba(test_X)

    assert np.array_equal(test_assignments, expected_results["medoids"])
    assert test_rand == expected_score["medoids"]
    assert train_rand == train_expected_score["medoids"]
    assert np.isclose(clusterer.inertia_, expected_inertia["medoids"])
    assert clusterer.n_iter_ == expected_iters["medoids"]
    assert np.array_equal(clusterer.labels_, expected_labels["medoids"])
    assert isinstance(clusterer.cluster_centers_, np.ndarray)

    assert probabilities.shape == (40, 8)
    # Each row must contain exactly one entry equal to 1.0.
    for row in probabilities:
        hits = np.count_nonzero(row == 1.0)
        assert hits == 1
def plotnCluster(x, y):
    """Plot rand score and fit time of a Gaussian mixture for 1..10 components.

    For each component count ``n`` in 1..10 a ``GaussianMixture`` is fitted on
    ``x``, its predicted labels are scored against ``y`` with
    ``metrics.rand_score`` and the wall-clock duration of fit + scoring is
    recorded. Two side-by-side line plots are then shown: score (as a
    percentage) and time in seconds, both against the number of components.

    Parameters
    ----------
    x : feature data passed to ``GaussianMixture.fit_predict``.
    y : reference labels passed to ``metrics.rand_score``.
    """
    component_counts = list(range(1, 11))
    accuracy = []
    times = []
    for n in component_counts:
        start = datetime.datetime.now()
        y_predict = GaussianMixture(n_components=n).fit_predict(x)
        y_test_accuracy = metrics.rand_score(y, y_predict)
        stop = datetime.datetime.now()
        accuracy.append(y_test_accuracy * 100)
        times.append((stop - start).total_seconds())

    _, axes = plt.subplots(1, 2, figsize=(20, 5))

    # The x values are the component counts that were actually evaluated, so
    # the axis cannot drift out of sync with the loop above.
    axes[0].grid()
    axes[0].plot(component_counts, accuracy, label="em", color="blue", lw=2)
    axes[0].legend(loc="best")
    axes[0].set_xlabel("number of clusters")
    axes[0].set_ylabel("Accuracy score %")

    axes[1].grid()
    axes[1].plot(component_counts, times, label="em", color="blue", lw=2)
    axes[1].legend(loc="best")
    axes[1].set_xlabel("number of clusters")
    axes[1].set_ylabel("Time in Seconds")

    plt.suptitle('Cancer')
    plt.show()
def eval_clustering(X, labels, true_labels=None):
    """Compute internal and, when possible, external clustering scores.

    Parameters
    ----------
    X : data passed to ``silhouette_score`` and ``avg_dist_within``.
    labels : cluster assignment for each sample in ``X``.
    true_labels : optional reference labels. When omitted, ``None`` or empty,
        no rand score is computed.

    Returns
    -------
    list
        ``[silhouette, average_distance_within, rand]`` where ``rand`` is
        ``-1`` if no reference labels were supplied.
    """
    sil_score = silhouette_score(X, labels, metric='euclidean')
    average_distance_within = avg_dist_within(X, labels)

    # ``len`` is used instead of truthiness so numpy arrays are accepted.
    has_truth = true_labels is not None and len(true_labels) != 0
    rand_sc = rand_score(true_labels, labels) if has_truth else -1

    return [sil_score, average_distance_within, rand_sc]
def print_score(*args):
    """Print a list of five clustering scores computed from ``args``.

    ``args`` is forwarded unchanged to each scorer. The printed list holds,
    in order: rand score, adjusted mutual information, homogeneity,
    completeness and V-measure.
    """
    scorers = (
        metrics.rand_score,
        metrics.adjusted_mutual_info_score,
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
    )
    print([scorer(*args) for scorer in scorers])
def kmeanProcessed():
    """Run 2-cluster KMeans on each transformed dataset and print its rand score.

    Each of the ``ica``, ``nmf``, ``pca`` and ``randDe`` modules supplies its
    data through ``transformData()``, which is unpacked as
    ``(features, labels)``.
    """
    reducers = (
        ("ica", ica),
        ("nmf", nmf),
        ("pca", pca),
        ("randDe", randDe),
    )
    for tag, reducer in reducers:
        print('this is the ' + tag)
        features, truth = reducer.transformData()
        assignments = KMeans(n_clusters=2).fit_predict(features)
        agreement = metrics.rand_score(truth, assignments)
        print(tag + ' kmean rand score')
        print(agreement)
        print(tag + ' done')
def emProcessed():
    """Run a 2-component Gaussian mixture on each transformed dataset.

    Each of the ``ica``, ``nmf``, ``pca`` and ``randDe`` modules supplies its
    data through ``transformData()``, which is unpacked as
    ``(features, labels)``. The rand score of the mixture's assignments is
    printed for each module.
    """
    reducers = (
        ("ica", ica),
        ("nmf", nmf),
        ("pca", pca),
        ("randDe", randDe),
    )
    for tag, reducer in reducers:
        print('this is the ' + tag)
        features, truth = reducer.transformData()
        assignments = GaussianMixture(n_components=2).fit_predict(features)
        agreement = metrics.rand_score(truth, assignments)
        print(tag + ' em rand score')
        print(agreement)
        print(tag + ' done')
def test_kmeans_dba():
    """Check TimeSeriesKMeans with DBA averaging against stored expectations.

    Only the first five train and test cases of the basic motions data are
    used. Predictions, rand scores, iteration count and fitted labels are
    compared with the module-level ``expected_*`` tables under the ``"dba"``
    key.
    """
    train_X, train_y = load_basic_motions(split="train")
    test_X, test_y = load_basic_motions(split="test")
    subset_size = 5

    window_params = {"window": 0.2}
    clusterer = TimeSeriesKMeans(
        averaging_method="dba",
        random_state=1,
        n_init=2,
        n_clusters=4,
        init_algorithm="kmeans++",
        metric="dtw",
        distance_params=dict(window_params),
        average_params=dict(window_params),
    )

    train_subset = train_X.head(subset_size)
    test_subset = test_X.head(subset_size)

    # Fit first; every later call relies on the fitted state.
    fitted_assignments = clusterer.fit_predict(train_subset)
    train_rand = metrics.rand_score(train_y[0:subset_size], fitted_assignments)

    test_assignments = clusterer.predict(test_subset)
    test_rand = metrics.rand_score(test_y[0:subset_size], test_assignments)

    probabilities = clusterer.predict_proba(test_subset)

    assert np.array_equal(test_assignments, expected_results["dba"])
    assert test_rand == expected_score["dba"]
    assert train_rand == expected_train_result["dba"]
    assert clusterer.n_iter_ == expected_iters["dba"]
    assert np.array_equal(clusterer.labels_, expected_labels["dba"])
    assert isinstance(clusterer.cluster_centers_, np.ndarray)

    assert probabilities.shape == (5, 4)
    # Each row must contain exactly one entry equal to 1.0.
    for row in probabilities:
        hits = np.count_nonzero(row == 1.0)
        assert hits == 1
def main():
    """Run DBSCAN on a whitespace-delimited data file and report its quality.

    Command line: ``python dbscan.py <eps> <minPts> <input file path>``.
    Missing arguments fall back to ``eps=0.8``, ``minPts=3`` and
    ``data/dataset1.txt``.

    Column 1 of the file is used as the reference labels. Feature columns
    start at index 2, or at index 3 when the path contains ``"iyer"``.
    The predicted clustering is plotted, then the Jaccard coefficient,
    correlation and adjusted rand score are printed, and finally the k-NN
    plot for ``minPts`` is drawn.

    Every ``print`` takes one parenthesised argument so the statements behave
    the same on Python 2 and Python 3.
    """
    import os  # local import: only needed to derive the dataset name

    ep = .8
    min_pts = 3  # kept off the name ``min`` so the builtin is not shadowed
    path = "data/dataset1.txt"

    print(str(len(sys.argv)))
    if len(sys.argv) < 4:
        print("Usage : python dbscan.py <eps> <minPts> <input file path>")
        print("running for default values")
    if len(sys.argv) > 1 and sys.argv[1]:
        ep = float(sys.argv[1])
    if len(sys.argv) > 2 and sys.argv[2]:
        min_pts = int(sys.argv[2])
    if len(sys.argv) > 3 and sys.argv[3]:
        path = str(sys.argv[3])

    si = 2
    if "iyer" in path:
        si = 3

    # Parse the file once and slice it, rather than loading it twice.
    data = np.loadtxt(path)
    X = data[:, si:]
    trueLabels = data[:, 1]

    dbscan = DBScan()
    labels = dbscan.fit(X, ep, min_pts)

    # Take the name from the final path component, so paths with no
    # directory or with several directories still give the file's stem.
    dataset_name = os.path.basename(path).split(".")[0]
    plot_title = (dataset_name + "_predicted_clusters_min:" + str(min_pts)
                  + "_eps:" + str(ep))
    plotPCA(X, plot_title, True, labels)

    import metrics  # code in metrics.py
    print("jaccard coeff")
    jac_metric = metrics.calculateJaccardCoeff(trueLabels, labels)
    print(jac_metric)

    print("correlation")
    cor = metrics.computeCorrelation(X, labels)
    print(cor)

    from sklearn.metrics import adjusted_rand_score as rand_score  # library function
    print("adjusted rand score")
    rand_met = rand_score(trueLabels.T, labels.T)
    print(rand_met)

    dbscan.plotKnn(X, min_pts)
def debug_clusterers():
    """Print the train rand score of TimeSeriesKMeans for every distance.

    For each name in the module-level ``distances`` a two-cluster model is
    fitted on the train split of the basic motions data (loaded as
    ``numpy3d``) and scored on that same split. Each result is printed as
    ``"<distance>": <score>,``.
    """
    train_X, train_y = load_basic_motions(split="train", return_type="numpy3d")
    # Alternative data sets used while debugging:
    # load_unit_test(split="train", return_type="numpy3d")
    # load_unit_test(split="train", return_type="numpy2d")

    # The same parameter dict is handed to every distance.
    shared_params = {"window": 1.0, "epsilon": 50.0, "g": 0.05, "c": 1.0}

    for distance_name in distances:
        clusterer = TimeSeriesKMeans(
            averaging_method="mean",
            random_state=1,
            n_init=2,
            n_clusters=2,
            init_algorithm="kmeans++",
            metric=distance_name,
            distance_params=shared_params,
        )
        clusterer.fit(train_X)
        assignments = clusterer.predict(train_X)
        score = metrics.rand_score(train_y, assignments)
        print('"' + distance_name + '": ' + str(score) + ",")
def plotnCluster():
    """Plot Gaussian-mixture rand score and fit time per transformed dataset.

    For each of the ``ica``, ``nmf``, ``pca`` and ``randDe`` modules the data
    returned by ``transformData()`` is clustered with a ``GaussianMixture``
    for 1..10 components. The rand score (as a percentage) and the wall-clock
    duration of fit + scoring are drawn as one line per module on two shared
    axes.
    """
    reducers = (
        ("ica", ica),
        ("nmf", nmf),
        ("pca", pca),
        ("randDe", randDe),
    )
    component_counts = list(range(1, 11))
    _, axes = plt.subplots(1, 2, figsize=(20, 5))

    for name, reducer in reducers:
        accuracy = []
        times = []
        print('this is the ' + name)
        x, y = reducer.transformData()
        for n in component_counts:
            start = datetime.datetime.now()
            y_predict = GaussianMixture(n_components=n).fit_predict(x)
            y_test_accuracy = metrics.rand_score(y, y_predict)
            stop = datetime.datetime.now()
            accuracy.append(y_test_accuracy * 100)
            times.append((stop - start).total_seconds())
        # The x values are the component counts that were actually evaluated.
        axes[0].plot(component_counts, accuracy, label=name, lw=2)
        axes[1].plot(component_counts, times, label=name, lw=2)

    axes[0].grid()
    axes[0].legend(loc="best")
    axes[0].set_xlabel("number of clusters")
    axes[0].set_ylabel("Accuracy score %")

    axes[1].grid()
    axes[1].legend(loc="best")
    axes[1].set_xlabel("number of clusters")
    axes[1].set_ylabel("Time in Seconds")

    plt.suptitle('EM Cancer')
    plt.show()
def evaluateTfIdf(lables_true, lables_pred):
    """Print rand, homogeneity and adjusted-MI scores for the TF-IDF run.

    Parameters
    ----------
    lables_true : reference labels.
    lables_pred : predicted cluster labels.
    """
    print("withTfIdf")
    report = (
        ("metrics.rand_score", metrics.rand_score),
        ("metrics.homogeneity_score", metrics.homogeneity_score),
        ("metrics.adjusted_mutual_info_score",
         metrics.adjusted_mutual_info_score),
    )
    for caption, scorer in report:
        print(caption, scorer(lables_true, lables_pred))
def evaluateWordToVec(lables_true, lables_pred):
    """Print rand, homogeneity and adjusted-MI scores for the word2vec run.

    Parameters
    ----------
    lables_true : reference labels.
    lables_pred : predicted cluster labels.
    """
    print("withWordToVec")
    report = (
        ("metrics.rand_score", metrics.rand_score),
        ("metrics.homogeneity_score", metrics.homogeneity_score),
        ("metrics.adjusted_mutual_info_score",
         metrics.adjusted_mutual_info_score),
    )
    for caption, scorer in report:
        print(caption, scorer(lables_true, lables_pred))
def save_murcko_result(murcko_clusters):
    """Map every member of a large cluster to that cluster's positional index.

    Parameters
    ----------
    murcko_clusters : dict
        Maps a cluster key to a sized iterable of its members (the script
        below works with SMILES strings).

    Returns
    -------
    dict
        ``{member: index}`` for members of clusters with more than 400
        entries. ``index`` is the cluster's position in the iteration order
        of ``murcko_clusters``, counting small clusters too.

    NOTE(review): despite the name, nothing is written to disk here. The
    mapping was previously built and discarded; it is now returned so the
    caller can persist it.
    """
    df_dict = {}
    for index, members in enumerate(murcko_clusters.values()):
        if len(members) > 400:
            for sm in members:
                df_dict[sm] = index
    return df_dict


if __name__ == "__main__":
    # read the compounds from murcko_labels.csv
    murcko = pd.read_csv("../data/labels/murcko_labels.csv")

    # get descriptor of each compound in murcko_labels.csv from ../data
    desc_df = read_csvs("../data")
    desc_dict = dict(zip(desc_df.smiles, desc_df.descriptors))
    chemprop_desc = {}
    for index, sm in enumerate(murcko.smiles):
        if not index % 10000:
            print(index)  # progress marker every 10000 compounds
        chemprop_desc[sm] = desc_dict[sm]

    # cluster these compounds with affinity propagation
    # (MiniBatchKMeans(n_clusters=21, verbose=1) was used here previously)
    print("starting clustering")
    ap = AffinityPropagation(verbose=True).fit(list(chemprop_desc.values()))

    # evaluate the clustering by external indices; the labels must come from
    # the model fitted above, since the name ``kmeans`` is no longer defined
    print("starting rand index computations")
    rand = rand_score(list(murcko.label), ap.labels_)
import os # Change the current sys path os.chdir( "/Users/davidlin/Desktop/School/Master/2021_secondSem/SC/image-segmentation/" ) from Code.lib import generators import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style("white") import pandas as pd from hmmlearn import hmm from sklearn.cluster import KMeans, SpectralClustering from sklearn.datasets import make_circles from sklearn import metrics # =============================================================================================== ### Scenario 1: Simulate three clusters, each of them is 2-d independent bivariate normal distribution # number of clusters num_clusters = 3 # sample size n = 1000 # Specify three mean vectors for these clusters mu1 = np.array([1, 2]).reshape(2, 1) mu2 = np.array([18, 5]).reshape(2, 1) mu3 = np.array([5, 15]).reshape(2, 1) mu = [mu1, mu2, mu3] # Specify three covariance matrix for these clusters Sigma1 = np.array([[2, 0], [0, 2]]) Sigma2 = np.array([[2, 0], [0, 1]]) Sigma3 = np.array([[5, 0], [0, 4]])
def accuracy():
    """Build the data frames comparing KMeans with ``commonAnalyze`` output.

    Two families of populations are generated from the module-level ``pi``,
    ``T`` and ``E2``. In both, the fourth parameter of the first state's
    distribution is swept over ``np.linspace(4.0, 20.0, num_data_points)``
    while the second state is always ``E2[1]``. The first family uses
    ``max_desired_num_cells`` and the second three times that.

    Returns
    -------
    tuple
        ``(distribution_df, wasser_df)``. ``distribution_df`` holds per-cell
        "G1 lifetime" and "State" for four lineages taken from the second
        family, with a "Distribution Similarity" band label.
        ``wasser_df`` holds, per population of the first family, the first
        value returned by ``commonAnalyze``, its ``"state_similarity"``
        output and the KMeans rand score as a percentage.
    """

    def _swept_emissions():
        # One [modified first state, unchanged second state] pair per value.
        reference = E2[1]
        pairs = []
        for a in np.linspace(4.0, 20.0, num_data_points):
            modified = StateDistribution(reference.params[0],
                                         reference.params[1],
                                         reference.params[2], a,
                                         reference.params[4],
                                         reference.params[5])
            pairs.append([modified, E2[1]])
        return pairs

    # Populations to analyze over.
    emissions_main = _swept_emissions()
    populations_main = []
    for E in emissions_main:
        populations_main.append(
            [LineageTree.init_from_parameters(pi, T, E, max_desired_num_cells)])

    # Larger populations used for the violin plots.
    emissions_violin = _swept_emissions()
    populations_violin = []
    for E in emissions_violin:
        populations_violin.append(
            [LineageTree.init_from_parameters(pi, T, E,
                                              3 * max_desired_num_cells)])

    kmeans_score = np.empty(len(populations_main))
    for idx, population in enumerate(populations_main):
        cells = [cell for lineage in population
                 for cell in lineage.output_lineage]
        true_states = np.array([cell.state for cell in cells])
        observations = np.array([cell.obs for cell in cells])
        cluster_labels = KMeans(n_clusters=2).fit(observations).labels_
        kmeans_score[idx] = 100 * rand_score(true_states, cluster_labels)

    # Replace x with 100 - x wherever the score is below 50%.
    below_half = kmeans_score < 50.0
    kmeans_score[below_half] = 100.0 - kmeans_score[below_half]

    wass, _, analysis_out, _ = commonAnalyze(
        populations_main,
        2,
        xtype="wass",
        list_of_fpi=[pi] * num_data_points,
        list_of_fT=[T] * num_data_points,
        parallel=True)
    state_similarity = analysis_out["state_similarity"]

    distribution_df = pd.DataFrame(
        columns=["Distribution Similarity", "G1 lifetime", "State"])

    # Four lineages taken at the quarter points of the sweep.
    picked = []
    for quarter in range(4):
        position = int(num_data_points * quarter / 4.)
        picked.append(populations_violin[position][0])
    picked_sizes = [len(lineage) for lineage in picked]
    picked_cells = [cell for lineage in picked
                    for cell in lineage.output_lineage]

    distribution_df["G1 lifetime"] = [(cell.obs[1] + cell.obs[2])
                                      for cell in picked_cells]
    distribution_df["State"] = [
        "State 1" if cell.state == 0 else "State 2" for cell in picked_cells
    ]

    largest = wass[-1]
    band_labels = [
        "Same\n" + str(0) + "-" + str(largest / 4),
        "Similar\n" + str(largest / 4) + "-" + str(largest / 2),
        "Different\n" + str(largest / 2) + "-" + str(largest * 0.75),
        "Distinct\n>" + str(largest * 0.75),
    ]
    similarity_column = []
    for size, label in zip(picked_sizes, band_labels):
        similarity_column.extend(size * [label])
    distribution_df["Distribution Similarity"] = similarity_column

    # for the violin plot (distributions)
    wasser_df = pd.DataFrame(
        columns=["Wasserstein distance", "Random Index Accuracy"])
    wasser_df["Wasserstein distance"] = wass
    wasser_df["Random Index Accuracy"] = state_similarity
    wasser_df["KMeans Accuracy"] = kmeans_score

    return distribution_df, wasser_df
def em(x, y):
    """Fit a 25-component Gaussian mixture on ``x`` and print its rand score.

    Parameters
    ----------
    x : feature data passed to ``GaussianMixture.fit_predict``.
    y : reference labels the predicted components are compared against.
    """
    mixture = GaussianMixture(n_components=25)
    assignments = mixture.fit_predict(x)
    agreement = metrics.rand_score(y, assignments)
    print(agreement)
def report_clustering(distance_file,
                      biom_file,
                      metadata_file,
                      num_clusters,
                      verbose,
                      L=2,
                      output_file=None):
    """Cluster a precomputed distance matrix and score it against body sites.

    The matrix is clustered with complete-linkage ``AgglomerativeClustering``
    and with PAM ``KMedoids``. Reference labels come from the ``body_site``
    metadata of each sample, encoded as integers in first-seen order. Five
    agreement scores are computed for each clustering.

    Parameters
    ----------
    distance_file : list or path
        A list is used directly as the distance matrix. Anything else is
        read with ``CSV.read``.
    biom_file : passed to ``BW.extract_samples`` to obtain the sample ids.
    metadata_file : passed to ``meta.extract_metadata``. The result is
        indexed as ``metadata[sample]['body_site']``.
    num_clusters : int
        Number of clusters requested from both algorithms.
    verbose : bool
        When truthy, the report is printed to stdout.
    L : int
        ``1`` or ``2`` selects the ``L1-UniFrac`` / ``L2-UniFrac`` title line.
        Any other value produces no title line.
    output_file : path or None
        When given, the report is also written to this file.

    Returns
    -------
    list
        Rows of the report: optional title, column header, then one
        ``[name, agglomerative_score, kmedoids_score]`` row per score.
    """
    if not isinstance(distance_file, list):
        distance_matrix = CSV.read(distance_file)
    else:
        distance_matrix = distance_file

    # NOTE(review): newer scikit-learn releases rename ``affinity`` to
    # ``metric``. Confirm against the pinned version before upgrading.
    AgglomerativeCluster = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed',
        linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(n_clusters=num_clusters,
                               metric='precomputed',
                               method='pam',
                               init='heuristic').fit_predict(distance_matrix)

    samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)

    # Encode each body site as the order in which it was first seen.
    site_codes = {}
    true_labels = []
    for sample in samples:
        site = metadata[sample]['body_site']
        true_labels.append(site_codes.setdefault(site, len(site_codes)))

    if L == 1:
        title = 'Printing results for L1-UniFrac:'
    elif L == 2:
        title = 'Printing results for L2-UniFrac:'
    else:
        title = None

    file_lines = []
    output_matrix = []

    if title is not None:
        if verbose:
            print(title)
        file_lines.append(title + '\n')
        output_matrix.append([title])

    # The console and file headers use different tab padding on purpose.
    if verbose:
        print('Metric\t\t\t\t\t\t\tAgglomerativeClustering\t\tKMedoids')
    file_lines.append('Metric\t\t\t\tAgglomerativeClustering\t\t\tKMedoids\n')
    output_matrix.append(['Metric', 'AgglomerativeClustering', 'KMedoids'])

    scorers = (
        ('Rand Index Score', rand_score),
        ('Adjusted Rand Index Score', adjusted_rand_score),
        ('Normalized Mutual Index Score', normalized_mutual_info_score),
        ('Adjusted Mutual Info Score', adjusted_mutual_info_score),
        ('Fowlkes Mallows Score', fowlkes_mallows_score),
    )
    for name, scorer in scorers:
        agglomerative_score = scorer(true_labels, AgglomerativeCluster)
        kmedoids_score = scorer(true_labels, KMedoidsCluster)
        line = f'{name}: {agglomerative_score}\t\t\t{kmedoids_score}'
        if verbose:
            print(line)
        file_lines.append(line + '\n')
        output_matrix.append([name + ':', agglomerative_score, kmedoids_score])

    # Write everything in one go inside a context manager so the handle is
    # always closed. It was previously opened and never closed.
    if output_file is not None:
        with open(output_file, 'w') as f:
            f.writelines(file_lines)

    return output_matrix
def kmean(x, y):
    """Cluster ``x`` into two groups with KMeans and print the rand score.

    Parameters
    ----------
    x : feature data passed to ``KMeans.fit_predict``.
    y : reference labels the predicted clusters are compared against.
    """
    clusterer = KMeans(n_clusters=2)
    assignments = clusterer.fit_predict(x)
    agreement = metrics.rand_score(y, assignments)
    print(agreement)
def Results(tHMMobj, pred_states_by_lineage: list, LL: float) -> dict[str, Any]:
    """Collect summary metrics for a model fitted to synthetic lineages.

    State labels are first re-ordered: every permutation of the state indices
    is applied to the predicted states and scored with ``tHMMobj.log_score``,
    and the best-scoring permutation is handed to ``permute_states``. All
    metrics are then read from the permuted model.

    Parameters
    ----------
    tHMMobj : fitted model exposing ``X``, ``estimate``, ``num_states``,
        ``fpi``/``fT``/``fE``, ``predict()`` and ``log_score()``.
    pred_states_by_lineage : list
        Not read by this function.
    LL : float
        Log likelihood, stored unchanged under ``"LL"``.

    Returns
    -------
    dict
        Lineage and cell counts, ``LL``, parameter estimates and true values,
        norm differences for ``T`` and ``pi``, per-state ``dist`` values,
        state counts and proportions for the first lineage, the rand score
        (as a percentage) between true and predicted states, and the ``dist``
        between the two true state distributions.
    """
    results: dict[str, Any] = {}

    # Use the fixed parameters when they are set, else the first lineage's.
    if tHMMobj.fpi is not None:
        pi_arg = tHMMobj.fpi
    else:
        pi_arg = tHMMobj.X[0].pi
    if tHMMobj.fE is not None:
        E_arg = tHMMobj.fE
    else:
        E_arg = tHMMobj.X[0].E
    if tHMMobj.fT is not None:
        T_arg = tHMMobj.fT
    else:
        T_arg = tHMMobj.X[0].T

    # Score every relabelling of the predicted states by log-likelihood.
    candidates = list(itertools.permutations(np.arange(tHMMobj.num_states)))
    candidate_scores = np.empty(len(candidates))
    predicted = tHMMobj.predict()
    for idx, mapping in enumerate(candidates):
        relabelled = []
        for assignment in predicted:
            relabelled.append([mapping[state] for state in assignment])
        candidate_scores[idx] = np.sum(
            tHMMobj.log_score(relabelled, pi=pi_arg, T=T_arg, E=E_arg))

    # Apply the relabelling with the highest total log-likelihood.
    best_mapping = np.array(candidates[np.argmax(candidate_scores)])
    tHMMobj, predicted = permute_states(tHMMobj, best_mapping)

    results["total_number_of_lineages"] = len(tHMMobj.X)
    results["LL"] = LL
    results["total_number_of_cells"] = sum(
        len(lineage.output_lineage) for lineage in tHMMobj.X)

    true_states = []
    for lineage in tHMMobj.X:
        true_states.append([cell.state for cell in lineage.output_lineage])

    results["transition_matrix_similarity"] = np.linalg.norm(
        tHMMobj.estimate.T - tHMMobj.X[0].T)
    results["pi_similarity"] = np.linalg.norm(tHMMobj.X[0].pi -
                                              tHMMobj.estimate.pi)

    # Estimated and true emission parameters, one entry per state.
    state_ids = range(tHMMobj.num_states)
    results["param_estimates"] = [
        tHMMobj.estimate.E[s].params for s in state_ids
    ]
    results["param_trues"] = [tHMMobj.X[0].E[s].params for s in state_ids]

    # Distance between estimated and true distribution for states 0 and 1.
    results["distribution distance 0"] = tHMMobj.estimate.E[0].dist(
        tHMMobj.X[0].E[0])
    results["distribution distance 1"] = tHMMobj.estimate.E[1].dist(
        tHMMobj.X[0].E[1])

    # State occupancy of the first lineage after relabelling.
    first_lineage_states = predicted[0]
    counts = np.bincount(first_lineage_states)
    results["state_counter"] = counts
    results["state_proportions"] = [
        100.0 * count / len(first_lineage_states) for count in counts
    ]
    results["state_proportions_0"] = results["state_proportions"][0]

    # Agreement between true and freshly predicted states over all cells.
    flat_true = list(itertools.chain.from_iterable(true_states))
    flat_predicted = list(itertools.chain.from_iterable(tHMMobj.predict()))
    results["state_similarity"] = 100.0 * rand_score(flat_true, flat_predicted)

    # Distance between the two true state distributions.
    results["wasserstein"] = tHMMobj.X[0].E[0].dist(tHMMobj.X[0].E[1])

    return results
average='macro', zero_division='warn')) print("Precision - k-Means Clustering:") print(metrics.precision_score(y_test, y_predicted, average='macro')) print("F1 - k-Means Clustering:") print( metrics.f1_score(y_test, y_predicted, average='macro', zero_division='warn', labels=np.unique(y_predicted))) #calculation of rand score #Similarity measure between two clusterings by considering all pairs of samples and #counting pairs that are assigned in the same or different clusters in the predicted and true clusterings. rand = metrics.rand_score(y_test, y_predicted) print(rand) #The Rand Index computes a similarity measure between two clusterings by considering all pairs of samples and #counting pairs that are assigned in the same or different clusters in the predicted and true clusterings. rand = metrics.adjusted_rand_score(y_test, y_predicted) print(rand) #Adjusted Mutual Information (AMI) is an adjustment of the Mutual Information (MI) score to account for chance. #It accounts for the fact that the MI is generally higher for two clusterings with a larger number of clusters, #regardless of whether there is actually more information shared rand = metrics.adjusted_mutual_info_score(y_test, y_predicted) print(rand) print(type(y_predicted)) y_predicted = pd.DataFrame(y_predicted)
other = dnames[did] # ok now we mix these 2 and calculate the rand #dat = [load.loadgruen_single( f"../data/punk/{dname}",subsample=700) for dname in currentnames] #pp = pp_many.Data().fit(dat,debug_ftsel=False,scale=True, maxgenes = int(1500/len(currentnames))) # TODO: maxgenes for all parts.. together r=[] for i in range(5): dat = [load.loadgruen_single( f"../data/punk/{dname}",subsample=1000) for dname in [start, other]] pp = preprocessing.Data().fit(*dat, debug_ftsel=False, scale=True, maxgenes = 800) allteX = np.vstack(pp.dx) labels = natto.process.gmm_1(allteX) real_labels = [i for d in [pp.a, pp.b] for i in d.obs['true'].values] rands = rand_score(real_labels, labels) r.append(rands) randscr.append(np.array(r).mean()) print ("RAND", randscr) # then do this starting with any # [0.8924611305652826, 0.9637320660330164, 0.9003336668334168, 0.7692594297148575, 0.7688283141570785, 0.7588649324662331] # this is when we dont subsample for the samples to be even: # 0.8940090045022512, 0.9481543771885942, 0.8994602301150575, 0.7643338669334667, 0.76911995997999, 0.7652657328664332 # 2. optimization of clustering algorithms on real labels? # the best algo might not yield the best curve on the noise plot # should i just plot 999 noise plots? .... # %%
mi_edges = np.ma.masked_all((fragments.shape[1], fragments.shape[1])) ami_edges = np.ma.masked_all((fragments.shape[1], fragments.shape[1])) nmi_edges = np.ma.masked_all((fragments.shape[1], fragments.shape[1])) for i, j in tqdm(zip(*np.tril_indices(fragments.shape[1], k=-1)), desc='edges', total=((fragments.shape[1]**2) / 2 - fragments.shape[1])): # check if two sites are connected by enough fragments connections = get_connections(fragments[:, i], fragments[:, j]) if connections.sum() < min_connections: continue # rand index rand = rand_score(fragments[:, i][connections], fragments[:, j][connections]) rand_edges[i, j] = rand rand_edges[j, i] = rand adj_rand = adjusted_rand_score(fragments[:, i][connections], fragments[:, j][connections]) adj_rand_edges[i, j] = adj_rand adj_rand_edges[j, i] = adj_rand # mutual info mi = mutual_info_score(fragments[:, i][connections], fragments[:, j][connections]) mi_edges[i, j] = mi mi_edges[j, i] = mi ami = adjusted_mutual_info_score(fragments[:, i][connections],