def estimate_assignments(graph, n_communities, n_components=None, method="gc", metric=None):
    """Given a graph and n_communities, sweep over covariance structures.

    Not deterministic; does not use graph BIC or MSE to choose the best model.

    1. Does an embedding on the raw graph
    2. Runs GaussianCluster on the embedding; this sweeps covariance structure
       for the given n_communities
    3. Returns n_parameters based on the number used in GaussianCluster

    method : str
        "gc" : use graspy GaussianCluster (defaults to full covariance)
        "bc" : tommyclust with defaults, sweeping covariance, agglomeration,
               and linkage
        "bc-metric" : tommyclust with a custom metric, still sweeping everything
        "bc-none" : mostly for testing; should behave just like GaussianCluster
    """
    embed_graph = graph.copy()
    latent = AdjacencySpectralEmbed(n_components=n_components).fit_transform(embed_graph)
    if isinstance(latent, tuple):
        # directed graph: concatenate the out and in latent positions
        latent = np.concatenate(latent, axis=1)
    if method == "gc":
        gc = GaussianCluster(
            min_components=n_communities,
            max_components=n_communities,
            covariance_type="all",
        )
        vertex_assignments = gc.fit_predict(latent)
        n_params = gc.model_._n_parameters()
    elif method == "bc":
        vertex_assignments, n_params = brute_cluster(latent, [n_communities])
    elif method == "bc-metric":
        vertex_assignments, n_params = brute_cluster(latent, [n_communities], metric=metric)
    elif method == "bc-none":
        vertex_assignments, n_params = brute_cluster(
            latent,
            [n_communities],
            affinities=["none"],
            linkages=["none"],
            covariance_types=["full"],
        )
    else:
        raise ValueError("Unspecified clustering method")
    return (vertex_assignments, n_params)
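# A minimal usage sketch (added here; the simulation and parameter choices are
# assumptions, not from the original code): draw a two-block SBM with graspy's
# simulator and recover assignments via the "gc" path above.
if __name__ == "__main__":
    from graspy.simulations import sbm

    example_graph = sbm([100, 100], [[0.3, 0.05], [0.05, 0.3]])
    example_labels, example_n_params = estimate_assignments(
        example_graph, n_communities=2, method="gc"
    )
    print(example_labels[:10], example_n_params)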
def brute_graspy_cluster(Ns, x, covariance_types, ks, c_true, savefigs=None, graphList=None):
    if graphList is not None and 'all_bics' in graphList:
        _, ((ax0, ax1), (ax2, ax3)) = plt.subplots(
            2, 2, sharey='row', sharex='col', figsize=(10, 10)
        )
    titles = ['full', 'tied', 'diag', 'spherical']
    best_bic = -np.inf
    for N in Ns:
        bics = np.zeros([len(ks), len(covariance_types), N])
        aris = np.zeros([len(ks), len(covariance_types), N])
        for i in np.arange(N):
            graspy_gmm = GaussianCluster(
                min_components=ks[0],
                max_components=ks[len(ks) - 1],
                covariance_type=covariance_types,
                random_state=i,
            )
            c_hat, ari = graspy_gmm.fit_predict(x, y=c_true)
            bic_values = -graspy_gmm.bic_.values
            ari_values = graspy_gmm.ari_.values
            bics[:, :, i] = bic_values
            aris[:, :, i] = ari_values
            bic = bic_values.max()
            if bic > best_bic:
                idx = np.argmax(bic_values)
                idxs = np.unravel_index(idx, bic_values.shape)
                best_ari_bic = ari
                best_bic = bic
                best_k_bic = ks[idxs[0]]
                best_cov_bic = titles[3 - idxs[1]]
                best_c_hat_bic = c_hat
        max_bics = np.amax(bics, axis=2)
        title = 'N=' + str(N)
        if graphList is not None and 'all_bics' in graphList:
            ax0.plot(np.arange(1, len(ks) + 1), max_bics[:, 3])
            ax1.plot(np.arange(1, len(ks) + 1), max_bics[:, 2], label=title)
            ax2.plot(np.arange(1, len(ks) + 1), max_bics[:, 1])
            ax3.plot(np.arange(1, len(ks) + 1), max_bics[:, 0])

    if graphList is not None and 'best_bic' in graphList:
        # Plot with best BIC
        if c_true is None:
            best_ari_bic_str = 'NA'
        else:
            best_ari_bic_str = '%1.3f' % best_ari_bic
        fig_bestbic = plt.figure(figsize=(8, 8))
        ax_bestbic = fig_bestbic.add_subplot(1, 1, 1)
        # ptcolors = [colors[i] for i in best_c_hat_bic]
        ax_bestbic.scatter(x[:, 0], x[:, 1], c=best_c_hat_bic)
        # mncolors = [colors[i] for i in np.arange(best_k_bic)]
        mncolors = [i for i in np.arange(best_k_bic)]
        ax_bestbic.set_title(
            "py(agg-gmm) BIC %3.0f from " % best_bic + str(best_cov_bic)
            + " k=" + str(best_k_bic) + ' ari=' + best_ari_bic_str
        )  # + "iter=" + str(best_iter_bic)
        ax_bestbic.set_xlabel("First feature")
        ax_bestbic.set_ylabel("Second feature")
        if savefigs is not None:
            plt.savefig(savefigs + '_python_bestbic.jpg')

    if graphList is not None and 'all_bics' in graphList:
        # Plot of all BICs
        titles = ['full', 'tied', 'diag', 'spherical']
        # ax0.set_title(titles[0], fontsize=20, fontweight='bold')
        # ax0.set_ylabel('BIC', fontsize=20)
        ax0.locator_params(axis='y', tight=True, nbins=4)
        ax0.set_yticklabels(ax0.get_yticks(), fontsize=14)
        # ax1.set_title(titles[1], fontsize=20, fontweight='bold')
        legend = ax1.legend(loc='best', title='Number of\nRuns', fontsize=12)
        plt.setp(legend.get_title(), fontsize=14)
        # ax2.set_title(titles[2], fontsize=20, fontweight='bold')
        # ax2.set_xlabel('Number of components', fontsize=20)
        ax2.set_xticks(np.arange(0, 21, 4))
        ax2.set_xticklabels(ax2.get_xticks(), fontsize=14)
        # ax2.set_ylabel('BIC', fontsize=20)
        ax2.locator_params(axis='y', tight=True, nbins=4)
        ax2.set_yticklabels(ax2.get_yticks(), fontsize=14)
        # ax3.set_title(titles[3], fontsize=20, fontweight='bold')
        # ax3.set_xlabel('Number of components', fontsize=20)
        ax3.set_xticks(np.arange(0, 21, 4))
        ax3.set_xticklabels(ax3.get_xticks(), fontsize=14)
        if savefigs is not None:
            plt.savefig('.\\figures\\25_6_19_paperv2\\' + savefigs + '_graspy_bicplot2.jpg')
        plt.show()

    return best_c_hat_bic, best_cov_bic, best_k_bic, best_ari_bic, best_bic
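# A hypothetical usage sketch (the toy data and parameter grid are assumptions,
# and it relies on the same graspy behavior the function above assumes, i.e.
# fit_predict returning (labels, ari) when y is passed): sweep k = 2..6 over
# all four covariance types on a small Gaussian mixture, with 5 restarts.
if __name__ == "__main__":
    from sklearn.datasets import make_blobs

    X_toy, y_toy = make_blobs(n_samples=300, n_features=2, centers=3, random_state=0)
    toy_c_hat, toy_cov, toy_k, toy_ari, toy_bic = brute_graspy_cluster(
        Ns=[5],
        x=X_toy,
        covariance_types=['spherical', 'diag', 'tied', 'full'],
        ks=np.arange(2, 7),
        c_true=y_toy,
    )
    print(toy_k, toy_cov, toy_ari)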
def lse(adj, n_components, regularizer=None):
    # regularizer is accepted for API symmetry but not used below
    adj = pass_to_ranks(adj)
    lap = to_laplace(adj, form="R-DAD")
    ase = AdjacencySpectralEmbed(n_components=n_components)
    latent = ase.fit_transform(lap)
    latent = np.concatenate(latent, axis=-1)
    return latent


n_components = None
k = 30

latent = lse(adj, n_components, regularizer=None)
gmm = GaussianCluster(min_components=k, max_components=k)
pred_labels = gmm.fit_predict(latent)
stacked_barplot(pred_labels, class_labels, palette="tab20")

# %% [markdown]
# # verify on sklearn toy dataset

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X, y = make_blobs(n_samples=200, n_features=3, centers=None, cluster_std=3)
# y = y.astype(int).astype(str)

data_df = pd.DataFrame(
    data=np.concatenate((X, y[:, np.newaxis]), axis=-1),
    columns=("Dim 0", "Dim 1", "Dim 2", "Labels"),
)
# data_df["Labels"] = data_df["Labels"].values.astype("<U10")
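# %%
# A quick check (added here, not in the original cell): fit GaussianCluster on
# the toy blobs and verify that the selected number of components is close to
# the three centers make_blobs uses by default.
from sklearn.metrics import adjusted_rand_score

toy_gc = GaussianCluster(min_components=1, max_components=6, covariance_type="all")
toy_pred = toy_gc.fit_predict(X)
print("selected k:", toy_gc.model_.n_components)
print("ARI vs. truth:", adjusted_rand_score(y, toy_pred))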
    # - BIC
    bic_ = 2 * likeli - temp_n_params * np.log(n)
    # - ARI
    ari_ = ari(true_labels, temp_c_hat)

    return [combo, likeli, ari_, bic_]


np.random.seed(16661)

A = binarize(right_adj)
X_hat = np.concatenate(ASE(n_components=3).fit_transform(A), axis=1)
n, d = X_hat.shape

gclust = GCLUST(max_components=15)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(right_labels, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)
class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

for k in range(len(unique_labels)):
    for combo in list(combinations(np.unique(est_labels), k + 1)):
        combo = np.array(list(combo)).astype(int)
        combos.append(combo)
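# Side note (a hedged sketch, added here): the hand-computed bic_ above uses the
# "larger is better" convention, 2*loglik - p*log(n), while sklearn's
# GaussianMixture.bic() returns -2*loglik + p*log(n) ("smaller is better").
# The two should agree up to sign for the fitted model:
manual_loglik = np.sum(gclust.model_.score_samples(X_hat))
manual_bic = 2 * manual_loglik - gclust.model_._n_parameters() * np.log(n)
print(np.isclose(manual_bic, -gclust.model_.bic(X_hat)))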
n = 1000
pi = 0.9
A, counts = generate_cyclops(X, n, pi, None)

c = [0] * counts[0]
c += [1] * counts[1]

ase = ASE(n_components=3)
X_hat = ase.fit_transform(A)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_hat[:, 0], X_hat[:, 1], X_hat[:, 2], c=c)

gclust = GCLUST(max_components=4)
c_hat = gclust.fit_predict(X_hat)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_hat[:, 0], X_hat[:, 1], X_hat[:, 2], c=c_hat)


def quadratic(data, params):
    if data.ndim == 1:
        sum_ = np.sum(data[:-1] ** 2 * params[:-1]) + params[-1]
        return sum_
    elif data.ndim == 2:
        sums = np.sum(data[:, :-1] ** 2 * params[:-1], axis=1) + params[-1]
        return sums
    else:
        raise ValueError("unsupported data")
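# A small illustration (added; the parameter values are arbitrary assumptions):
# quadratic() evaluates sum_j params[j] * data[j]**2 + params[-1] over all but
# the last coordinate, broadcasting over rows when given a 2-D array.
example_params = np.array([1.0, -2.0, 0.5])  # two quadratic coefficients + intercept
print(quadratic(np.array([1.0, 2.0, 3.0]), example_params))  # single point -> scalar
print(quadratic(X_hat, example_params))  # (n,) array, one value per row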
unknown = classes == "Other"
plot_unknown = np.tile(unknown, n_graphs)
pairplot(plot_latent, labels=plot_unknown, alpha=0.3, legend_name="Unknown")

clust_latent = np.concatenate(list(latent), axis=-1)
clust_latent.shape

#%%
gc = GaussianCluster(min_components=2, max_components=15, covariance_type="all")
filterwarnings("ignore")

n_init = 50
sim_mat = np.zeros((n_verts, n_verts))
for i in tqdm(range(n_init)):
    assignments = gc.fit_predict(clust_latent)
    # increment co-assignment counts for every pair clustered together this run
    for c in np.unique(assignments):
        inds = np.where(assignments == c)[0]
        sim_mat[np.ix_(inds, inds)] += 1

sim_mat -= np.diag(np.diag(sim_mat))
sim_mat = sim_mat / n_init
heatmap(sim_mat)

#%%
thresh_sim_mat = sim_mat.copy()
thresh_sim_mat[thresh_sim_mat > 0.5] = 1
thresh_sim_mat[thresh_sim_mat < 0.5] = 0
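#%%
# A possible next step (a sketch, not in the original cell): treat the
# thresholded co-clustering matrix as a graph and take its connected
# components as consensus cluster labels.
from scipy.sparse.csgraph import connected_components

n_consensus, consensus_labels = connected_components(thresh_sim_mat, directed=False)
print(n_consensus)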