def fit(self, X, y=None):
    """Recursively bisect ``X`` with 1-vs-2 component Gaussian mixtures.

    Fits a GaussianCluster choosing between 1 and 2 components; if 2
    components win, splits the data on the predicted labels and fits a
    child PartitionCluster on each side. Recursion stops when the node
    has too few samples or when 1 component is selected.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster at this node.
    y : ignored
        Present for scikit-learn API compatibility.

    Returns
    -------
    self
    """
    n_samples = X.shape[0]
    if n_samples > self.min_split_samples:
        cluster = GaussianCluster(min_components=1, max_components=2, n_init=20)
        cluster.fit(X)
        self.model_ = cluster
    else:
        # Too few samples to attempt a split: mark this node as a leaf.
        self.pred_labels_ = np.zeros(X.shape[0])
        self.left_ = None
        self.right_ = None
        self.model_ = None
        return self
    # recurse
    if cluster.n_components_ != 1:
        # Two components were selected: partition on the predicted labels
        # and fit a child tree on each half.
        pred_labels = cluster.predict(X)
        self.pred_labels_ = pred_labels
        indicator = pred_labels == 0
        self.X_left_ = X[indicator, :]
        self.X_right_ = X[~indicator, :]
        split_left = PartitionCluster()
        self.left_ = split_left.fit(self.X_left_)
        split_right = PartitionCluster()
        self.right_ = split_right.fit(self.X_right_)
    else:
        # One component was selected: treat as a leaf. NOTE(review): this
        # discards the fitted model (model_ = None) even though a cluster
        # was fit above — presumably intentional leaf semantics; confirm.
        self.pred_labels_ = np.zeros(X.shape[0])
        self.left_ = None
        self.right_ = None
        self.model_ = None
    return self
def fit(self, X, y=None):
    """Recursively split ``X`` into clusters with the configured method.

    Chooses between 1 and 2 mixture components at this node; if 2
    components are selected, partitions the data on the predicted labels
    and fits a child ``DivisiveCluster`` on each part. Recursion stops
    when the node has at most ``min_split_samples`` samples or when a
    single component is selected.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster at this node.
    y : ignored
        Present for scikit-learn API compatibility.

    Returns
    -------
    self

    Raises
    ------
    NotImplementedError
        If ``cluster_method`` is ``"vmm"`` (not yet implemented).
    ValueError
        If ``cluster_method`` is not a recognized method name.
    """
    n_samples = X.shape[0]
    self.n_samples_ = n_samples
    self.cum_dist_ = 0
    if n_samples > self.min_split_samples:
        if self.cluster_method == "graspy-gmm":
            cluster = GaussianCluster(
                min_components=1,
                max_components=2,
                n_init=self.n_init,
                covariance_type="all",
            )
        elif self.cluster_method == "auto-gmm":
            cluster = AutoGMMCluster(
                min_components=1, max_components=2, max_agglom_size=None
            )
        elif self.cluster_method == "vmm":
            # BUG FIX: this branch was `pass`, leaving `cluster` unbound and
            # crashing with NameError at cluster.fit(X) below. Fail loudly
            # instead until VonMisesFisherMixture support is implemented.
            raise NotImplementedError(
                "`cluster_method` 'vmm' is not implemented yet"
            )
        else:
            # BUG FIX: `valid_methods` was referenced here but never defined
            # in this scope (NameError on the error path); define it locally.
            valid_methods = ("graspy-gmm", "auto-gmm", "vmm")
            raise ValueError(f"`cluster_method` must be one of {valid_methods}")
        cluster.fit(X)
        pred_labels = cluster.predict(X)
        self.pred_labels_ = pred_labels
        self.model_ = cluster
        if hasattr(cluster, "bic_"):
            # Ratio of best 2-component BIC to best 1-component BIC; used as a
            # measure of how strongly the data support a split.
            bics = cluster.bic_
            self.bics_ = bics
            bic_ratio = bics.loc[2].min() / bics.loc[1].min()
            self.bic_ratio_ = bic_ratio
        if cluster.n_components_ != 1:
            # recurse: split on predicted labels and fit one child per part
            indicator = pred_labels == 0
            self.X_children_ = (X[indicator, :], X[~indicator, :])
            children = []
            for i, X_child in enumerate(self.X_children_):
                child = DivisiveCluster(
                    name=self.name + str(i),
                    parent=self,
                    min_split_samples=self.min_split_samples,
                    n_init=self.n_init,
                    cluster_method=self.cluster_method,
                )
                child = child.fit(X_child)
                children.append(child)
            self.children = children
    return self
def fit(self, X, y=None):
    """Fit a 1-vs-2 component Gaussian mixture and recurse on each split.

    If two components are selected at this node, the data are partitioned
    on the predicted labels and a child ``DivisiveCluster`` is fit on each
    part; otherwise the node is a leaf. Nodes with at most
    ``min_split_samples`` samples are never split.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster at this node.
    y : ignored
        Present for scikit-learn API compatibility.

    Returns
    -------
    self
    """
    self.n_samples_ = X.shape[0]
    if self.n_samples_ > self.min_split_samples:
        model = GaussianCluster(min_components=1, max_components=2, n_init=40)
        model.fit(X)
        labels = model.predict(X)
        self.pred_labels_ = labels
        self.model_ = model
        if model.n_components_ != 1:
            # Two components selected: split on label 0 vs label 1 and
            # fit one child subtree per part.
            mask = labels == 0
            self.X_children_ = (X[mask, :], X[~mask, :])
            self.children = [
                DivisiveCluster(name=self.name + str(idx), parent=self).fit(part)
                for idx, part in enumerate(self.X_children_)
            ]
    return self
# Sweep over cluster counts: embed the graph, fit a fixed-k GMM per k, and
# record mushroom-body ARI scores. NOTE(review): relies on module globals
# (adj, embed, gmm_params, known_inds, mb_labels, out_dicts, MIN/MAX_CLUSTERS)
# defined elsewhere in this file/notebook.
cluster = "GMM"
# Laplacian spectral embedding of the adjacency matrix into 4 dimensions.
lse_latent = lse(adj, 4, regularizer=None)
latent = lse_latent
pairplot(latent, labels=simple_class_labels, title=embed)
for k in range(MIN_CLUSTERS, MAX_CLUSTERS + 1):
    run_name = f"k = {k}, {cluster}, {embed}, right hemisphere (A to D), PTR, raw"
    print(run_name)
    print()
    # Cluster
    gmm = GaussianCluster(min_components=k, max_components=k, **gmm_params)
    gmm.fit(latent)
    pred_labels = gmm.predict(latent)
    # ARI
    base_dict = {
        "K": k,
        "Cluster": cluster,
        "Embed": embed,
        "Method": f"{cluster} o {embed}",
        "Score": gmm.model_.score(latent),
    }
    # ARI restricted to the known (mushroom body) nodes only.
    mb_ari = sub_ari(known_inds, mb_labels, pred_labels)
    mb_ari_dict = base_dict.copy()
    mb_ari_dict["ARI"] = mb_ari
    mb_ari_dict["Metric"] = "MB ARI"
    out_dicts.append(mb_ari_dict)
def cluster_func(k, seed):
    """Fit a k-component GMM on the global embedding, score it, and save outputs.

    For a fixed ``k``, fits a GaussianCluster on the module-global ``latent``
    embedding, appends three ARI records (MB / simple / full label sets) to
    the global ``out_dicts``, and writes plots, skeleton files, and a
    colormap JSON to disk.

    NOTE(review): depends on many module globals (cluster, embed, latent,
    gmm_params, known_inds, mb_labels, simple_class_labels, class_labels,
    adj, skeleton_labels, out_dicts, FNAME, and the stash* helpers) —
    presumably defined elsewhere in this notebook-style file.

    Parameters
    ----------
    k : int
        Number of mixture components (min and max are both set to k).
    seed : int
        Seed for numpy's global RNG, for reproducible GMM initialization.
    """
    np.random.seed(seed)
    run_name = f"k = {k}, {cluster}, {embed}, right hemisphere (A to D), PTR, raw"
    print(run_name)
    print()
    # Cluster
    gmm = GaussianCluster(min_components=k, max_components=k, **gmm_params)
    gmm.fit(latent)
    pred_labels = gmm.predict(latent)
    # ARI
    base_dict = {
        "K": k,
        "Cluster": cluster,
        "Embed": embed,
        "Method": f"{cluster} o {embed}",
        "Score": gmm.model_.score(latent),
    }
    # ARI on the known mushroom-body subset of nodes.
    mb_ari = sub_ari(known_inds, mb_labels, pred_labels)
    mb_ari_dict = base_dict.copy()
    mb_ari_dict["ARI"] = mb_ari
    mb_ari_dict["Metric"] = "MB ARI"
    out_dicts.append(mb_ari_dict)
    # ARI against the coarse ("simple") class labels on the same subset.
    simple_ari = sub_ari(known_inds, simple_class_labels, pred_labels)
    simple_ari_dict = base_dict.copy()
    simple_ari_dict["ARI"] = simple_ari
    simple_ari_dict["Metric"] = "Simple ARI"
    out_dicts.append(simple_ari_dict)
    # ARI against the full class labels on all nodes.
    full_ari = adjusted_rand_score(class_labels, pred_labels)
    full_ari_dict = base_dict.copy()
    full_ari_dict["ARI"] = full_ari
    full_ari_dict["Metric"] = "Full ARI"
    out_dicts.append(full_ari_dict)
    save_name = f"k{k}-{cluster}-{embed}-right-ad-PTR-raw"
    # Plot embedding
    pairplot(latent, labels=pred_labels, title=run_name)
    # stashfig("latent-" + save_name)
    # Plot everything else
    prob_df = get_sbm_prob(adj, pred_labels)
    block_sum_df = get_block_edgesums(adj, pred_labels, prob_df.columns.values)
    clustergram(adj, latent, prob_df, block_sum_df, simple_class_labels, pred_labels)
    plt.suptitle(run_name, fontsize=40)
    stashfig("clustergram-" + save_name)
    # output skeletons
    _, colormap, pal = stashskel(
        save_name, skeleton_labels, pred_labels, palette="viridis", multiout=True
    )
    sns.set_context("talk")
    palplot(k, cmap="viridis")
    stashfig("palplot-" + save_name)
    # save dict colormapping
    filename = (
        Path("./maggot_models/notebooks/outs")
        / Path(FNAME)
        / str("colormap-" + save_name + ".json")
    )
    with open(filename, "w") as fout:
        json.dump(colormap, fout)
    # Also write the single-file (non-multiout) skeleton export.
    stashskel(
        save_name, skeleton_labels, pred_labels, palette="viridis", multiout=False
    )
def crossval_cluster(
    embed,
    left_inds,
    right_inds,
    R,
    min_clusters=2,
    max_clusters=15,
    n_init=25,
    left_pair_inds=None,
    right_pair_inds=None,
):
    """Cross-validate left/right hemisphere GMM clusterings of an embedding.

    For each candidate k, fits one GMM on the left-hemisphere embedding and
    one on the right, then scores each model on the *opposite* hemisphere
    after aligning it with the orthogonal transform ``R`` (right is mapped
    via ``R.T``, left via ``R``). If paired indices are supplied, also
    records a pairedness score and the ARI between the two models'
    predictions on the pairs.

    Parameters
    ----------
    embed : ndarray of shape (n_samples, n_components)
        Joint embedding of all nodes.
    left_inds, right_inds : array-like
        Row indices of the left/right hemisphere nodes in ``embed``.
    R : ndarray
        Orthogonal alignment matrix between the two hemispheres' embeddings.
    min_clusters, max_clusters : int
        Range of cluster counts to try. NOTE(review): ``range(min_clusters,
        max_clusters)`` excludes ``max_clusters`` itself — confirm whether
        the upper bound is meant to be inclusive.
    n_init : int
        Number of GMM initializations per fit.
    left_pair_inds, right_pair_inds : array-like or None
        Matched row indices of paired nodes; both must be given to enable
        the pairedness/ARI metrics.

    Returns
    -------
    pandas.DataFrame
        One row per (k, train-side) with BIC/likelihood metrics (BICs are
        negated so that larger is better), the fitted cluster object, and
        optionally pairedness/ARI.
    """
    left_embed = embed[left_inds]
    right_embed = embed[right_inds]
    # BUG FIX: `n_components` was used in the row dicts below but never
    # defined in this function (NameError unless an accidental global
    # existed). Record the embedding dimensionality, which is what the
    # column name indicates.
    n_components = embed.shape[1]
    print("Running left/right clustering with cross-validation\n")
    currtime = time.time()
    rows = []
    for k in tqdm(range(min_clusters, max_clusters)):
        # train left, test right
        # TODO add option for AutoGMM as well, might as well check
        left_gc = GaussianCluster(min_components=k, max_components=k, n_init=n_init)
        left_gc.fit(left_embed)
        model = left_gc.model_
        train_left_bic = model.bic(left_embed)
        train_left_lik = model.score(left_embed)
        # Map the right embedding into the left's space before scoring.
        test_left_bic = model.bic(right_embed @ R.T)
        test_left_lik = model.score(right_embed @ R.T)
        # train right, test left
        right_gc = GaussianCluster(min_components=k, max_components=k, n_init=n_init)
        right_gc.fit(right_embed)
        model = right_gc.model_
        train_right_bic = model.bic(right_embed)
        train_right_lik = model.score(right_embed)
        test_right_bic = model.bic(left_embed @ R)
        test_right_lik = model.score(left_embed @ R)
        # BICs are negated so that "higher is better" holds for every metric.
        left_row = {
            "k": k,
            "contra_bic": -test_left_bic,
            "contra_lik": test_left_lik,
            "ipsi_bic": -train_left_bic,
            "ipsi_lik": train_left_lik,
            "cluster": left_gc,
            "train": "left",
            "n_components": n_components,
        }
        right_row = {
            "k": k,
            "contra_bic": -test_right_bic,
            "contra_lik": test_right_lik,
            "ipsi_bic": -train_right_bic,
            "ipsi_lik": train_right_lik,
            "cluster": right_gc,
            "train": "right",
            "n_components": n_components,
        }
        # pairedness computation, if available
        if left_pair_inds is not None and right_pair_inds is not None:
            # TODO double check this is right
            pred_left = left_gc.predict(embed[left_pair_inds])
            pred_right = right_gc.predict(embed[right_pair_inds])
            pness, _, _ = compute_pairedness_bipartite(pred_left, pred_right)
            left_row["pairedness"] = pness
            right_row["pairedness"] = pness
            ari = adjusted_rand_score(pred_left, pred_right)
            left_row["ARI"] = ari
            right_row["ARI"] = ari
        rows.append(left_row)
        rows.append(right_row)
    results = pd.DataFrame(rows)
    print(f"{time.time() - currtime} elapsed")
    return results