def one_iteration(start_labels, class_key="Merge Class"):
    """Generate walks, embed them, cluster the embedding, and plot each stage.

    Parameters
    ----------
    start_labels
        Forwarded unchanged to ``random_walk_classes``.
    class_key : str, default "Merge Class"
        Forwarded to ``random_walk_classes`` as ``class_key``.

    Returns
    -------
    The labels produced by ``AutoGMMCluster.fit_predict`` on the 8-component
    PCA embedding of the log-transformed walk data.
    """
    # Generate walks; seed=None is passed explicitly, so no fixed seed is used.
    walk_data, walk_bins, known_classes = random_walk_classes(
        start_labels, seed=None, class_key=class_key
    )
    log_walks = np.log10(walk_data + 1)

    # Clustermap of the log-transformed walk data.
    path_clustermap(log_walks, known_classes, walk_bins)

    # Embed with PCA and plot, colored by the known class of each row.
    pca = PCA(n_components=8)
    embedding = pca.fit_transform(log_walks)
    pairplot(embedding, labels=known_classes, palette=CLASS_COLOR_DICT)

    # Cluster the embedding, sweeping 2-20 mixture components.
    clusterer = AutoGMMCluster(
        min_components=2, max_components=20, n_jobs=-1, verbose=10
    )
    cluster_labels = clusterer.fit_predict(embedding)

    # New figure so the model-selection scatter does not draw onto the pairplot.
    plt.figure()
    sns.scatterplot(data=clusterer.results_, x="n_components", y="bic/aic")

    # Same embedding, now colored by predicted cluster.
    pairplot(embedding, labels=cluster_labels, palette=cc.glasbey_light)

    # Composition of each predicted cluster in terms of the known classes.
    stacked_barplot(cluster_labels, known_classes, color_dict=CLASS_COLOR_DICT)
    return cluster_labels
def fit(self, X, y=None):
    """Recursively split ``X`` in two, building a tree of ``DivisiveCluster`` nodes.

    The node records ``n_samples_`` and resets ``cum_dist_`` to 0. If the
    number of rows exceeds ``self.min_split_samples``, a model restricted to
    one or two components is fit. When that model selects more than one
    component, the rows are partitioned by predicted label and a child
    ``DivisiveCluster`` is fit on each part.

    Parameters
    ----------
    X
        Data to cluster; must expose ``shape[0]`` and 2-D boolean row
        indexing (``X[mask, :]``).
    y
        Unused.

    Returns
    -------
    self

    Raises
    ------
    NotImplementedError
        If ``self.cluster_method == "vmm"``; that backend has no
        implementation yet.
    ValueError
        If ``self.cluster_method`` is not a recognised method.
    """
    n_samples = X.shape[0]
    self.n_samples_ = n_samples
    self.cum_dist_ = 0

    if n_samples > self.min_split_samples:
        if self.cluster_method == "graspy-gmm":
            cluster = GaussianCluster(
                min_components=1,
                max_components=2,
                n_init=self.n_init,
                covariance_type="all",
            )
        elif self.cluster_method == "auto-gmm":
            cluster = AutoGMMCluster(
                min_components=1, max_components=2, max_agglom_size=None
            )
        elif self.cluster_method == "vmm":
            # Previously a bare `pass`, which left `cluster` unbound and
            # failed on the next statement with an UnboundLocalError.
            # cluster = VonMisesFisherMixture(n)
            raise NotImplementedError(
                "`cluster_method='vmm'` is not implemented yet"
            )
        else:
            raise ValueError(f"`cluster_method` must be one of {valid_methods}")

        cluster.fit(X)
        pred_labels = cluster.predict(X)
        self.pred_labels_ = pred_labels
        self.model_ = cluster

        if hasattr(cluster, "bic_"):
            bics = cluster.bic_
            self.bics_ = bics
            # Ratio of the best score in row 2 to the best in row 1.
            # NOTE(review): presumably rows are indexed by number of
            # components - confirm against the clusterer's `bic_` layout.
            bic_ratio = bics.loc[2].min() / bics.loc[1].min()
            self.bic_ratio_ = bic_ratio

        if cluster.n_components_ != 1:
            # The model preferred two components: split and recurse.
            indicator = pred_labels == 0
            self.X_children_ = (X[indicator, :], X[~indicator, :])
            children = []
            for i, X_child in enumerate(self.X_children_):
                child = DivisiveCluster(
                    name=self.name + str(i),
                    parent=self,
                    min_split_samples=self.min_split_samples,
                    n_init=self.n_init,
                    cluster_method=self.cluster_method,
                )
                child = child.fit(X_child)
                children.append(child)
            self.children = children

    return self
print("Finding pairwise jaccard distances") pdist_sparse = pairwise_sparse_jaccard_distance(path_mat) print(pdist_sparse.shape) print("Embedding with MDS") mds = ClassicalMDS(dissimilarity="precomputed") # mds = MDS(dissimilarity="precomputed", n_components=6, n_init=16, n_jobs=-2) jaccard_embedding = mds.fit_transform(pdist_sparse) # %% [markdown] # # print("Clustering embedding") agmm = AutoGMMCluster(min_components=10, max_components=40, affinity="euclidean", linkage="single") labels = agmm.fit_predict(jaccard_embedding) pairplot(jaccard_embedding, title="AGMM o CMDS o Jaccard o Sensorimotor Paths", labels=labels) savefig("AGMM-CMDS-jaccard-sm-path") print("Finding mean paths") mean_paths = [] uni_labels = np.unique(labels) for ul in uni_labels: inds = np.where(labels == ul)[0] paths = path_mat[inds, :] mean_path = np.array(np.mean(paths, axis=0))
palette=CLASS_COLOR_DICT, ) stashfig("raw-response-pairs" + basename) # %% [markdown] # # Cluster each thing separately from graspy.cluster import AutoGMMCluster pred = [] for i, name in enumerate(from_group_names): print(name) o = log_collapsed_hist[:, i * n_bins:(i + 1) * n_bins] agmm = AutoGMMCluster( min_components=2, max_components=20, n_jobs=-2, verbose=10, affinity=["euclidean", "manhattan", "none"], ) pred_labels = agmm.fit_predict(o) pairplot(o[:, :5], labels=meta["Merge Class"].values, palette=CLASS_COLOR_DICT) stashfig(f"from-g{name}-known" + basename) pairplot(o[:, :5], labels=pred_labels, palette=cc.glasbey_light) stashfig(f"from-g{name}-predicted" + basename) print(len(np.unique(pred_labels))) pred.append(pred_labels) print() # generate_cascade_paths(start_ind, probs, 1, stop_inds=out_inds, max_depth=10)
for i, (fg, fg_name) in enumerate(zip(from_groups, from_group_names)): print(f"Clustering for {fg_name}") # run the clustering on histogram hop_hist = fg_hop_hists[i] X = hop_hist.T if normalize: sums = X.sum(axis=1) sums[sums == 0] = 1 X = X / sums[:, None] if log_cluster: X = np.log10(X + 1) agmm = AutoGMMCluster(**cluster_kws) pred_labels = agmm.fit_predict(X) results = agmm.results_ fg_col_meta[i]["pred_labels"] = pred_labels fg_autoclusters.append(agmm) ggmm = GaussianCluster(min_components=10, max_components=40, n_init=20, covariance_type="diag") ggmm.fit(X) fg_graspyclusters.append(ggmm) gbics = ggmm.bic_ fig, ax = plt.subplots(1, 1, figsize=(10, 5))
pc.set_zorder(10) ax.plot(rng, cmds.singular_values_, "o-") ax.legend() stashfig("cmds-screeplot" + basename) # %% [markdown] # ## pairplot(path_embed, alpha=0.02) stashfig("cmds-pairs-all" + basename) # %% [markdown] # ## print("Running AGMM on CMDS embedding") n_components = 4 agmm = AutoGMMCluster(max_components=40, n_jobs=-2) pred = agmm.fit_predict(path_embed[:, :n_components]) print(f"Number of clusters: {agmm.n_components_}") # %% [markdown] # ## pairplot( path_embed[:, :n_components], alpha=0.02, labels=pred, palette=cc.glasbey_light, legend_name="Cluster", ) stashfig("pairplot-agmm-cmds" + basename)
# Scree plots of the raw hop histogram; each one is saved right after drawing.
for n_shown, fig_name in ((40, "scree-first-40"), (None, "scree-all")):
    screeplot(all_hop_hist, show_first=n_shown)
    stashfig(fig_name)

# Scree plots of the log-transformed histogram (these two are not saved).
screeplot(np.log10(all_hop_hist + 1), show_first=100)
screeplot(np.log10(all_hop_hist + 1), show_first=100, cumulative=True)

# %% [markdown]
# ##
from graspy.cluster import AutoGMMCluster

# Sweep 2-50 components over two affinities on the transposed histogram.
agmm_kws = dict(
    min_components=2,
    max_components=50,
    affinity=["euclidean", "manhattan"],
    max_agglom_size=3000,
    n_jobs=-2,
    verbose=10,
)
agmm = AutoGMMCluster(**agmm_kws)
agmm.fit(all_hop_hist.T)

# %% [markdown]
# ##
from graspy.embed import select_dimension

# The result is not assigned, so it is only visible as notebook cell output.
select_dimension(all_hop_hist.T, n_elbows=5)

#%%
from graspy.embed import selectSVD
from graspy.plot import pairplot
side_mb_mg = side_mgs[side] labels = side_mb_mg.meta["class1"].values labels = np.vectorize(label_map.get)(labels) plot_labels = side_mb_mg.meta["merge_class"].values # embed ase = AdjacencySpectralEmbed(n_components=None, algorithm="randomized") embed = ase.fit_transform(pass_to_ranks(side_mb_mg.adj)) embed = np.concatenate(embed, axis=1) # cluster using AutoGMM method = "AutoGMM" agmm = AutoGMMCluster( min_components=2, max_components=10, affinity=["euclidean", "manhattan", "cosine"], covariance_type=["full"], n_jobs=-1, ) agmm.fit(embed, labels) agmm_results = agmm.results_.copy() agmm_results.sort_values("bic/aic", inplace=True) agmm_model = agmm.model_ agmm_pred_labels = agmm_model.predict(embed) ari = adjusted_rand_score(labels, agmm_pred_labels) ari_no_kc = adjusted_rand_score( labels[labels != "KC"], agmm_pred_labels[labels != "KC"] ) row = dict( ari=ari, ari_no_kc=ari_no_kc,
dendrogram_ratio=(0, 0.2), ) ax = cg.ax_heatmap ax.axhline(max_hops, linewidth=2, linestyle="--", color="grey") ax.set_xticks([]) ax.set_yticks([]) cg.ax_row_colors.set_ylabel("Hops") stashfig("clustermap" + basename) # %% [markdown] # ## last thing, gmm agmm = AutoGMMCluster( min_components=10, max_components=30, affinity=["euclidean", "manhattan"], max_agglom_size=3000, n_jobs=-2, verbose=10, ) agmm.fit(log_hop_hist.T) # %% [markdown] # ## results = agmm.results_ best_inds = results.groupby("n_components")["bic/aic"].idxmin() best_results = results.loc[best_inds] sns.scatterplot(data=best_results, x="n_components", y="bic/aic") k = 25 best_results = best_results.set_index("n_components") model = best_results.loc[k, "model"] pred_labels = model.predict(log_hop_hist.T)
# Embed the raw histogram data with PCA and plot it colored by known class.
embedding = PCA(n_components=8).fit_transform(raw_hist_data)
pairplot(embedding, labels=dfs[0]["Merge Class"].values, palette=CLASS_COLOR_DICT)

# %% [markdown]
# #
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering runs on the raw data, not the PCA embedding; the
# embedding is only used to visualise the resulting labels.
# NOTE(review): scikit-learn >= 1.2 renames `affinity` to `metric` - update
# this call if the pinned scikit-learn version is bumped.
agg = AgglomerativeClustering(n_clusters=10, affinity="euclidean", linkage="average")
labels = agg.fit_predict(raw_hist_data)
pairplot(embedding, labels=labels, palette=cc.glasbey_light)

# %% [markdown]
# #
from graspy.cluster import AutoGMMCluster

# Fixed: the constructor call was missing its closing parenthesis, which made
# this cell a syntax error.
agm = AutoGMMCluster(min_components=2, max_components=20, n_jobs=-1)
agm.fit(embedding)

# %% [markdown]
# #
# agm.results_.groupby(["affinity", "covariance_type", "linkage"])
sns.scatterplot(data=agm.results_, x='n_components', y='bic/aic')

# %% [markdown]
# #
# Compare the selected model's clusters against the known classes.
new_groups = agm.predict(embedding)
stacked_barplot(new_groups, meta["Merge Class"].values, color_dict=CLASS_COLOR_DICT)

# %% [markdown]
# #
else: fig, ax = plt.subplots(1, 1, figsize=(10, 10)) sns.scatterplot(path_embed[:, 0], path_embed[:, 1], s=30, alpha=0.2) ax.axis("off") stashfig(f"pairs-all" + basename) # %% [markdown] # ## Cluster and plot on the embedding print("Running AGMM on path embedding") n_components = elbows[0] # n_components = 2d print(f"Using {n_components} dimensions") agmm = AutoGMMCluster(max_components=30, n_jobs=-2) pred = agmm.fit_predict(path_embed[:, :n_components] ) # + np.random.normal(0, 0.01, size=path_embed.shape) print(f"Number of clusters: {agmm.n_components_}") pg = pairplot( path_embed[:, :n_components], alpha=0.1, labels=pred, palette=cc.glasbey_light, legend_name="Cluster", ) leg = pg._legend for lh in leg.legendHandles: lh.set_alpha(1)
heatmap( mg.adj, transform="simple-all", title=f"MB, threshold={threshold}", inner_hier_labels=true_labels, hier_label_fontsize=10, sort_nodes=True, ) latent = ase(mg.adj, n_components, ptr=ptr) # cluster = GaussianCluster( # min_components=2, max_components=10, covariance_type="all", n_init=100 # ) cluster = AutoGMMCluster(min_components=2, max_components=10) pred_labels = cluster.fit_predict(latent) ari = adjusted_rand_score(true_labels, pred_labels) row = {"ARI": ari, "Threshold": threshold, "Method": "GMMoASE"} rows.append(row) # do the MCMC block_series = run_minimize_blockmodel(mg, weight_model="discrete-poisson") ari = adjusted_rand_score(true_labels, block_series.values) row = {"ARI": ari, "Threshold": threshold, "Method": "GT-dp"} rows.append(row) # do the MCMC block_series = run_minimize_blockmodel(mg, weight_model=None) ari = adjusted_rand_score(true_labels, block_series.values) row = {"ARI": ari, "Threshold": threshold, "Method": "GT-None"}
) stashfig("agglomerative-path-dist-mat") # %% [markdown] # ## # %% [markdown] # ## # manifold = ClassicalMDS(n_components=2, dissimilarity="precomputed") manifold = TSNE(metric="precomputed") path_embed = manifold.fit_transform(path_dist_mat) path_embed = path_embed + np.random.normal(0, 0.01, size=path_embed.shape) agmm = AutoGMMCluster(max_components=5, n_jobs=-2) pred = agmm.fit_predict(path_embed) plot_df = pd.DataFrame(data=path_embed) plot_df["labels"] = pred fig, ax = plt.subplots(1, 1, figsize=(10, 10)) sns.scatterplot( data=plot_df, x=0, y=1, hue="labels", palette="Set1", # legend="full", ax=ax, s=20, linewidth=0.5,
cutoff = 8 base = f"-c{cutoff}-t{threshold}-{graph_type}" base_path = Path(f"./maggot_models/notebooks/outs/{run_name}/csvs") meta = pd.read_csv(base_path / str("meta" + base + ".csv"), index_col=0) path_mat = pd.read_csv(base_path / str("prob-path-mat" + base + ".csv"), index_col=0).values base_path = Path(f"./maggot_models/notebooks/outs/{embed_name}/csvs") embed_mat = pd.read_csv(base_path / str("euclid-mds-embed.csv"), index_col=0) gmm = AutoGMMCluster( min_components=10, max_components=50, affinity="all", linkage="all", covariance_type="all", n_jobs=-2, verbose=30, ) labels = gmm.fit_predict(embed_mat.values) label_df = pd.DataFrame(data=labels) stashcsv(label_df, "labels") print("Finding mean paths") mean_paths = [] uni_labels = np.unique(labels) for ul in uni_labels: inds = np.where(labels == ul)[0] paths = path_mat[inds, :]