示例#1
0
    def fit(self, X, y=None):
        """Recursively bipartition ``X`` with 1-vs-2-component Gaussian clustering.

        If BIC prefers two components, the data are split on the predicted
        labels and a child ``PartitionCluster`` is fit on each half; otherwise
        this node becomes a leaf.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to cluster.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """

        def _mark_leaf():
            # Terminal node: one flat cluster, no children, no model.
            # (Previously duplicated verbatim in two branches.)
            self.pred_labels_ = np.zeros(X.shape[0])
            self.left_ = None
            self.right_ = None
            self.model_ = None

        n_samples = X.shape[0]
        if n_samples <= self.min_split_samples:
            # Too few samples to justify another split.
            _mark_leaf()
            return self

        cluster = GaussianCluster(min_components=1,
                                  max_components=2,
                                  n_init=20)
        cluster.fit(X)
        self.model_ = cluster

        if cluster.n_components_ == 1:
            # Model selection preferred a single component: stop here.
            # NOTE: this also resets model_ to None, matching the original
            # behavior for the single-component case.
            _mark_leaf()
            return self

        # Two components won: split on the predicted labels and recurse.
        pred_labels = cluster.predict(X)
        self.pred_labels_ = pred_labels
        indicator = pred_labels == 0
        self.X_left_ = X[indicator, :]
        self.X_right_ = X[~indicator, :]
        self.left_ = PartitionCluster().fit(self.X_left_)
        self.right_ = PartitionCluster().fit(self.X_right_)
        return self
示例#2
0
 def fit(self, X, y=None):
     """Fit one node of a divisive (top-down) clustering tree.

     Chooses between 1 and 2 components with the configured clustering
     method; if 2 components are selected, splits the data on the
     predicted labels and recursively fits a ``DivisiveCluster`` child
     on each half.

     Parameters
     ----------
     X : array-like of shape (n_samples, n_features)
         Data to cluster at this node.
     y : ignored
         Present for scikit-learn API compatibility.

     Returns
     -------
     self

     Raises
     ------
     NotImplementedError
         If ``cluster_method`` is ``"vmm"`` (not implemented yet).
     ValueError
         If ``cluster_method`` is not a recognized method.
     """
     n_samples = X.shape[0]
     self.n_samples_ = n_samples
     self.cum_dist_ = 0
     if n_samples > self.min_split_samples:
         valid_methods = ["graspy-gmm", "auto-gmm", "vmm"]
         if self.cluster_method == "graspy-gmm":
             cluster = GaussianCluster(
                 min_components=1,
                 max_components=2,
                 n_init=self.n_init,
                 covariance_type="all",
             )
         elif self.cluster_method == "auto-gmm":
             cluster = AutoGMMCluster(
                 min_components=1, max_components=2, max_agglom_size=None
             )
         elif self.cluster_method == "vmm":
             # BUG FIX: this branch was `pass`, which left `cluster`
             # unbound and crashed below with a confusing NameError.
             raise NotImplementedError(
                 "`cluster_method` 'vmm' is not implemented yet"
             )
         else:
             # BUG FIX: `valid_methods` was referenced here but never
             # defined, turning the intended ValueError into a NameError.
             raise ValueError(f"`cluster_method` must be one of {valid_methods}")
         cluster.fit(X)
         pred_labels = cluster.predict(X)
         self.pred_labels_ = pred_labels
         self.model_ = cluster
         if hasattr(cluster, "bic_"):
             # Ratio of best 2-component BIC to best 1-component BIC,
             # used downstream as a split-quality diagnostic.
             bics = cluster.bic_
             self.bics_ = bics
             bic_ratio = bics.loc[2].min() / bics.loc[1].min()
             self.bic_ratio_ = bic_ratio
         if cluster.n_components_ != 1:  # recurse
             indicator = pred_labels == 0
             self.X_children_ = (X[indicator, :], X[~indicator, :])
             children = []
             for i, X_child in enumerate(self.X_children_):
                 child = DivisiveCluster(
                     name=self.name + str(i),
                     parent=self,
                     min_split_samples=self.min_split_samples,
                     n_init=self.n_init,
                     cluster_method=self.cluster_method,
                 )
                 child = child.fit(X_child)
                 children.append(child)
             self.children = children
     return self
示例#3
0
 def fit(self, X, y=None):
     """Fit this node of the divisive clustering tree.

     Fits a 1-or-2 component Gaussian mixture; when two components are
     selected, partitions the data on the predicted labels and fits a
     child ``DivisiveCluster`` on each side.

     Parameters
     ----------
     X : array-like of shape (n_samples, n_features)
         Data to cluster at this node.
     y : ignored
         Present for scikit-learn API compatibility.

     Returns
     -------
     self
     """
     self.n_samples_ = X.shape[0]
     if self.n_samples_ > self.min_split_samples:
         gmm = GaussianCluster(min_components=1,
                               max_components=2,
                               n_init=40)
         gmm.fit(X)
         labels = gmm.predict(X)
         self.pred_labels_ = labels
         self.model_ = gmm
         if gmm.n_components_ != 1:
             # Two components preferred: split and recurse on each side.
             left_mask = labels == 0
             self.X_children_ = (X[left_mask, :], X[~left_mask, :])
             self.children = [
                 DivisiveCluster(name=self.name + str(i),
                                 parent=self).fit(X_child)
                 for i, X_child in enumerate(self.X_children_)
             ]
     return self
示例#4
0
cluster = "GMM"

# Embed the graph, then visualize the latent positions colored by class.
lse_latent = lse(adj, 4, regularizer=None)

latent = lse_latent
pairplot(latent, labels=simple_class_labels, title=embed)

for k in range(MIN_CLUSTERS, MAX_CLUSTERS + 1):
    run_name = f"k = {k}, {cluster}, {embed}, right hemisphere (A to D), PTR, raw"
    print(run_name)
    print()

    # Fit a GMM with exactly k components and predict a label per node.
    gmm = GaussianCluster(min_components=k, max_components=k, **gmm_params)
    gmm.fit(latent)
    pred_labels = gmm.predict(latent)

    # Shared metadata for every metric row produced by this run.
    base_dict = dict(
        K=k,
        Cluster=cluster,
        Embed=embed,
        Method=f"{cluster} o {embed}",
        Score=gmm.model_.score(latent),
    )
    mb_ari = sub_ari(known_inds, mb_labels, pred_labels)
    mb_ari_dict = {**base_dict, "ARI": mb_ari, "Metric": "MB ARI"}
    out_dicts.append(mb_ari_dict)
示例#5
0
def cluster_func(k, seed):
    """Run one seeded k-component GMM clustering and stash all outputs.

    Side effects only: appends ARI metric rows to the module-level
    ``out_dicts``, saves figures via ``stashfig``, and writes skeleton
    files plus a colormap JSON via ``stashskel``.

    Parameters
    ----------
    k : int
        Number of mixture components to fit.
    seed : int
        Seed for ``np.random`` so runs are reproducible.
    """
    np.random.seed(seed)
    run_name = f"k = {k}, {cluster}, {embed}, right hemisphere (A to D), PTR, raw"
    print(run_name)
    print()

    # Fit a GMM with exactly k components on the latent positions.
    gmm = GaussianCluster(min_components=k, max_components=k, **gmm_params)
    gmm.fit(latent)
    pred_labels = gmm.predict(latent)

    # Shared metadata for every metric row produced by this run.
    base_dict = {
        "K": k,
        "Cluster": cluster,
        "Embed": embed,
        "Method": f"{cluster} o {embed}",
        "Score": gmm.model_.score(latent),
    }

    # ARI against the known mushroom-body labels.
    mb_ari = sub_ari(known_inds, mb_labels, pred_labels)
    out_dicts.append({**base_dict, "ARI": mb_ari, "Metric": "MB ARI"})

    # ARI against the coarse class labels.
    simple_ari = sub_ari(known_inds, simple_class_labels, pred_labels)
    out_dicts.append({**base_dict, "ARI": simple_ari, "Metric": "Simple ARI"})

    # ARI against the full class labels.
    full_ari = adjusted_rand_score(class_labels, pred_labels)
    out_dicts.append({**base_dict, "ARI": full_ari, "Metric": "Full ARI"})

    save_name = f"k{k}-{cluster}-{embed}-right-ad-PTR-raw"

    # Plot embedding
    pairplot(latent, labels=pred_labels, title=run_name)
    # stashfig("latent-" + save_name)

    # Plot block-probability and edge-sum summaries.
    prob_df = get_sbm_prob(adj, pred_labels)
    block_sum_df = get_block_edgesums(adj, pred_labels, prob_df.columns.values)

    clustergram(
        adj, latent, prob_df, block_sum_df, simple_class_labels, pred_labels
    )
    plt.suptitle(run_name, fontsize=40)
    stashfig("clustergram-" + save_name)

    # Output skeletons, one file per predicted cluster.
    _, colormap, pal = stashskel(
        save_name,
        skeleton_labels,
        pred_labels,
        palette="viridis",
        multiout=True,
    )

    sns.set_context("talk")
    palplot(k, cmap="viridis")

    stashfig("palplot-" + save_name)

    # Persist the cluster -> color mapping alongside the figures.
    filename = (
        Path("./maggot_models/notebooks/outs")
        / Path(FNAME)
        / str("colormap-" + save_name + ".json")
    )
    with open(filename, "w") as fout:
        json.dump(colormap, fout)

    # Also write a single combined skeleton file.
    stashskel(
        save_name,
        skeleton_labels,
        pred_labels,
        palette="viridis",
        multiout=False,
    )
示例#6
0
def crossval_cluster(
    embed,
    left_inds,
    right_inds,
    R,
    min_clusters=2,
    max_clusters=15,
    n_init=25,
    left_pair_inds=None,
    right_pair_inds=None,
):
    """Cross-validate GMM clusterings between the two hemispheres.

    For each ``k`` in ``range(min_clusters, max_clusters)``, fits a
    k-component ``GaussianCluster`` on each hemisphere's embedding and
    evaluates it on the opposite hemisphere after aligning with the
    matrix ``R``. Optionally scores pairedness/ARI between the two
    fitted clusterings on known left/right pairs.

    Parameters
    ----------
    embed : array-like of shape (n_samples, n_dims)
        Joint embedding; rows are indexed by ``left_inds``/``right_inds``.
    left_inds, right_inds : array-like of int
        Row indices of the left and right hemisphere nodes in ``embed``.
    R : array-like of shape (n_dims, n_dims)
        Alignment matrix; left is mapped via ``@ R`` and right via
        ``@ R.T`` (asymmetry is intentional — R maps left into right space).
    min_clusters, max_clusters : int
        Half-open range of component counts to sweep.
    n_init : int
        Number of GMM initializations per fit.
    left_pair_inds, right_pair_inds : array-like of int, optional
        Matched row indices of known left/right pairs; when both are
        given, pairedness and ARI between hemispheres are computed.

    Returns
    -------
    pd.DataFrame
        One row per (k, train-hemisphere) with BIC/likelihood scores
        (BICs negated so that larger is better), the fitted cluster
        object, and optional pairedness/ARI columns.
    """
    left_embed = embed[left_inds]
    right_embed = embed[right_inds]
    print("Running left/right clustering with cross-validation\n")
    currtime = time.time()
    rows = []
    for k in tqdm(range(min_clusters, max_clusters)):
        # train left, test right
        # TODO add option for AutoGMM as well, might as well check
        left_gc = GaussianCluster(min_components=k,
                                  max_components=k,
                                  n_init=n_init)
        left_gc.fit(left_embed)
        model = left_gc.model_
        train_left_bic = model.bic(left_embed)
        train_left_lik = model.score(left_embed)
        test_left_bic = model.bic(right_embed @ R.T)
        test_left_lik = model.score(right_embed @ R.T)

        # train right, test left
        right_gc = GaussianCluster(min_components=k,
                                   max_components=k,
                                   n_init=n_init)
        right_gc.fit(right_embed)
        model = right_gc.model_
        train_right_bic = model.bic(right_embed)
        train_right_lik = model.score(right_embed)
        test_right_bic = model.bic(left_embed @ R)
        test_right_lik = model.score(left_embed @ R)

        # BICs are negated so that "larger is better" for every score column.
        left_row = {
            "k": k,
            "contra_bic": -test_left_bic,
            "contra_lik": test_left_lik,
            "ipsi_bic": -train_left_bic,
            "ipsi_lik": train_left_lik,
            "cluster": left_gc,
            "train": "left",
            # BUG FIX: `n_components` was an undefined name (NameError);
            # record the fitted model's selected component count instead.
            "n_components": left_gc.n_components_,
        }
        right_row = {
            "k": k,
            "contra_bic": -test_right_bic,
            "contra_lik": test_right_lik,
            "ipsi_bic": -train_right_bic,
            "ipsi_lik": train_right_lik,
            "cluster": right_gc,
            "train": "right",
            # BUG FIX: same undefined-name fix as above.
            "n_components": right_gc.n_components_,
        }

        # pairedness computation, if available
        if left_pair_inds is not None and right_pair_inds is not None:
            # TODO double check this is right
            pred_left = left_gc.predict(embed[left_pair_inds])
            pred_right = right_gc.predict(embed[right_pair_inds])
            pness, _, _ = compute_pairedness_bipartite(pred_left, pred_right)
            left_row["pairedness"] = pness
            right_row["pairedness"] = pness

            ari = adjusted_rand_score(pred_left, pred_right)
            left_row["ARI"] = ari
            right_row["ARI"] = ari

        rows.append(left_row)
        rows.append(right_row)

    results = pd.DataFrame(rows)
    print(f"{time.time() - currtime} elapsed")
    return results