Example No. 1
def get_clusters(clusterable_embedding_, min_cluster_size, min_samples):
    clusterer = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        prediction_data=True).fit(clusterable_embedding_)
    soft_clusters_ = hdbscan.all_points_membership_vectors(clusterer)
    return soft_clusters_
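For context, a minimal usage sketch of the helper above (the blob data and parameter values are illustrative, not from the original source, and `hdbscan` is assumed importable as in the example's own module): `all_points_membership_vectors` returns one column per cluster, so a hard assignment can be recovered with an argmax, even for points that `labels_` would mark as noise (-1).

import numpy as np

# Illustrative data: three Gaussian blobs stand in for a real UMAP embedding.
rng = np.random.default_rng(0)
embedding = np.vstack([rng.normal(loc=c, scale=0.3, size=(200, 2)) for c in (0.0, 5.0, 10.0)])

soft_clusters = get_clusters(embedding, min_cluster_size=25, min_samples=5)

# One membership column per cluster; argmax yields a hard label per point.
hard_labels = np.argmax(soft_clusters, axis=1)
print(soft_clusters.shape, np.unique(hard_labels))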
Example No. 2
def bsoid_hdbscan(umap_embeddings, hdbscan_params=HDBSCAN_PARAMS):
    """
    Trains HDBSCAN (unsupervised) given learned UMAP space
    :param umap_embeddings: 2D array, embedded UMAP space
    :param hdbscan_params: dict, HDBSCAN params in GLOBAL_CONFIG
    :return: assignments (hard HDBSCAN labels), soft_clusters (membership vectors), soft_assignments (argmax of soft_clusters)
    """
    highest_numulab = -np.inf
    numulab = []
    min_cluster_range = range(6, 21)
    logging.info('Running HDBSCAN on {} instances in {} D space...'.format(*umap_embeddings.shape))
    for min_c in min_cluster_range:
        trained_classifier = hdbscan.HDBSCAN(prediction_data=True,
                                             min_cluster_size=int(round(0.001 * min_c * umap_embeddings.shape[0])),
                                             **hdbscan_params).fit(umap_embeddings)
        numulab.append(len(np.unique(trained_classifier.labels_)))
        if numulab[-1] > highest_numulab:
            logging.info('Adjusting minimum cluster size to maximize cluster number...')
            highest_numulab = numulab[-1]
            best_clf = trained_classifier
    assignments = best_clf.labels_
    soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
    soft_assignments = np.argmax(soft_clusters, axis=1)
    # trained_classifier = hdbscan.HDBSCAN(prediction_data=True,
    #                                      min_cluster_size=round(umap_embeddings.shape[0] * 0.007),  # just < 1%/cluster
    #                                      **hdbscan_params).fit(umap_embeddings)
    # assignments = best_clf.labels_
    logging.info('Done predicting labels for {} instances in {} D space...'.format(*umap_embeddings.shape))
    return assignments, soft_clusters, soft_assignments
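A hedged end-to-end sketch of how this function might be driven (the UMAP/HDBSCAN parameter dicts and the synthetic blobs below are stand-ins for the project's GLOBAL_CONFIG values and real pose features): the loop above sweeps min_cluster_size from 0.6% to 2.0% of the instances and keeps the fit that yields the most labels.

import logging
import numpy as np
import umap
import hdbscan

logging.basicConfig(level=logging.INFO)

# Stand-ins for the values normally read from GLOBAL_CONFIG.
UMAP_PARAMS = {'n_components': 3, 'min_dist': 0.0, 'random_state': 42}
HDBSCAN_PARAMS = {'min_samples': 10}

rng = np.random.default_rng(42)
features = np.vstack([rng.normal(loc=c, scale=0.5, size=(2000, 12)) for c in (0.0, 4.0, 8.0)])

umap_embeddings = umap.UMAP(**UMAP_PARAMS).fit_transform(features)
assignments, soft_clusters, soft_assignments = bsoid_hdbscan(
    umap_embeddings, hdbscan_params=HDBSCAN_PARAMS)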
Example No. 3
    def cluster(self, distances, metric='euclidean',
                allow_single_cluster=False, prediction_data=False, min_cluster_size=2):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ## Cluster on the UMAP embeddings and return soft clusters
            tuned_eom = utils.hyperparameter_selection(
                distances, self.threads, metric=metric, method="eom",
                allow_single_cluster=allow_single_cluster,
                starting_size=min_cluster_size)
            tuned_leaf = utils.hyperparameter_selection(
                distances, self.threads, metric=metric, method="leaf",
                allow_single_cluster=allow_single_cluster,
                starting_size=min_cluster_size)
            best_eom = utils.best_validity(tuned_eom)
            best_leaf = utils.best_validity(tuned_leaf)

            if int(best_eom["validity_score"]) >= int(best_leaf["validity_score"]):
                best = best_eom
                binning_method = "eom"
            else:
                best = best_leaf
                binning_method = "leaf"

            if metric == 'precomputed':
                clusterer = hdbscan.HDBSCAN(
                    algorithm='best',
                    alpha=1.0,
                    cluster_selection_method=binning_method,
                    metric=metric,
                    min_cluster_size=int(best['min_cluster_size']),
                    min_samples=int(best['min_samples']),
                    allow_single_cluster=allow_single_cluster,
                    core_dist_n_jobs=self.threads,
                    approx_min_span_tree=False
                )
                clusterer.fit(distances)
                if prediction_data:
                    self.soft_clusters = None

            else:
                clusterer = hdbscan.HDBSCAN(
                    algorithm='best',
                    alpha=1.0,
                    approx_min_span_tree=True,
                    gen_min_span_tree=True,
                    leaf_size=40,
                    cluster_selection_method=binning_method,
                    metric=metric,
                    min_cluster_size=int(best['min_cluster_size']),
                    min_samples=int(best['min_samples']),
                    allow_single_cluster=allow_single_cluster,
                    core_dist_n_jobs=self.threads,
                    prediction_data=prediction_data
                )
                clusterer.fit(distances)
                if prediction_data:
                    self.soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
            return clusterer.labels_
Example No. 4
def auto_clust(sense_data: SenseData,
               umap_n_neighbors=2,
               umap_ndim=2,
               umap_min_dist=0.1,
               clust_min_samples=2,
               print_clust=False):
    vecs = sense_data.sense_vecs

    umap_inst = umap.UMAP(n_components=umap_ndim,
                          n_neighbors=umap_n_neighbors,
                          metric='cosine',
                          min_dist=umap_min_dist,
                          random_state=4422)
    proj = umap_inst.fit_transform(vecs)
    clust = hdbscan.HDBSCAN(min_cluster_size=2,
                            min_samples=clust_min_samples,
                            prediction_data=True).fit(proj)

    clabels = clust.labels_
    probs = clust.probabilities_
    sfreqs = np.array(sense_data.sense_freqs)
    slabels = sense_data.sense_labels

    if np.all(clabels < 0):
        prob_mat = np.array([])
        clust_freq = np.array([])
    else:
        prob_mat = hdbscan.all_points_membership_vectors(clust)
        clust_freq = (sfreqs[:, np.newaxis] * prob_mat).sum(0)

    sense_clusters = {}
    for clust_idx in np.unique(clabels):
        idx_list = (clabels == clust_idx).nonzero()[0]
        idx_list = sorted(idx_list, key=lambda x: -probs[x])

        sense_clusters[clust_idx] = [(i, probs[i], sfreqs[i], slabels[i])
                                     for i in idx_list]

        if print_clust:
            print("-- Cluster %d --" % (clust_idx, ))
            print("\n".join(f"[{x[0]:2d}] {x[1]:.2f}({x[2]:3d}): {x[3]}"
                            for x in sense_clusters[clust_idx]))
            print("\n")

    if print_clust:
        if proj.shape[1] == 1:
            plt.scatter(proj[:, 0],
                        np.ones(proj.shape[0]),
                        c=clust.labels_,
                        cmap="Set1")
        else:
            plt.scatter(proj[:, 0], proj[:, 1], c=clust.labels_, cmap="Set1")

    return {
        "projection": proj,
        "sense_clusters": sense_clusters,
        "sense_freqs": sfreqs,
        "cluster_freqs": clust_freq,
        "memberships": prob_mat
    }
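A hypothetical driver for auto_clust (the SenseDataStub namedtuple merely mimics the attributes the function reads: sense_vecs, sense_freqs, sense_labels; real sense vectors would come from a sense-embedding model, and umap/hdbscan are assumed imported in the function's own module):

from collections import namedtuple
import numpy as np

SenseDataStub = namedtuple('SenseDataStub', 'sense_vecs sense_freqs sense_labels')

rng = np.random.default_rng(0)
n_senses = 12
stub = SenseDataStub(
    sense_vecs=rng.normal(size=(n_senses, 50)),
    sense_freqs=[int(f) for f in rng.integers(1, 100, size=n_senses)],
    sense_labels=[f"sense_{i}" for i in range(n_senses)])

result = auto_clust(stub, print_clust=False)
print(result["sense_clusters"].keys(), result["cluster_freqs"])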
Example No. 5
    def hierarchy(self):
        if st.button("__Identify Clusters__"):
            funfacts = randfacts.getFact()
            st.info(
                str.join('',
                         ('Identifying... Here is a random fact: ', funfacts)))
            max_num_clusters = -np.inf
            num_clusters = []
            self.min_cluster_size = np.linspace(self.cluster_range[0],
                                                self.cluster_range[1], 25)

            for min_c in self.min_cluster_size:
                learned_hierarchy = hdbscan.HDBSCAN(
                    prediction_data=True,
                    min_cluster_size=int(
                        round(min_c * 0.01 *
                              self.sampled_embeddings.shape[0])),
                    **HDBSCAN_PARAMS).fit(self.sampled_embeddings)
                num_clusters.append(len(np.unique(learned_hierarchy.labels_)))
                if num_clusters[-1] > max_num_clusters:
                    max_num_clusters = num_clusters[-1]
                    retained_hierarchy = learned_hierarchy
            self.assignments = retained_hierarchy.labels_
            self.assign_prob = hdbscan.all_points_membership_vectors(
                retained_hierarchy)
            self.soft_assignments = np.argmax(self.assign_prob, axis=1)
            st.info('Done assigning labels for **{}** instances ({} minutes) '
                    'in **{}** D space'.format(
                        self.assignments.shape[0],
                        round(self.assignments.shape[0] / 600),
                        self.sampled_embeddings.shape[1]))
            st.balloons()
Example No. 6
        def fit_transform(self,
                          dataset: Dataset,
                          name: str,
                          remove_disc: bool = True) -> TopicModel:
            # WARNING: setting a seed for reproducibility makes the algorithm run on a single core (-> slower)
            seed = None
            if get_seed():
                seed = get_seed()
            # https://umap-learn.readthedocs.io/en/latest/index.html
            mapper = umap.UMAP(random_state=seed,
                               **self.u_args).fit(dataset.get_count_matrix())
            # WARNING: some points might be disconnected (np.inf)
            if remove_disc:
                disc = umap.utils.disconnected_vertices(mapper)
                embedding = mapper.embedding_[~disc, :]
            else:
                embedding = mapper.embedding_

            # https://hdbscan.readthedocs.io/en/latest/index.html
            clusterer = hdbscan.HDBSCAN(prediction_data=True,
                                        **self.h_args).fit(embedding)
            # labels = clusterer.labels_
            # predicted labels (hard clusters) with -1 for too noisy observations: how to return them?
            doc_topic_matrix = hdbscan.all_points_membership_vectors(clusterer)
            topic_word_matrix = np.array([])

            return TopicModel.from_array(name, topic_word_matrix,
                                         doc_topic_matrix)
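One hedged answer to the question in the comment above, as a standalone sketch rather than a change to the class: hard topic ids fall out of an argmax over the same membership matrix, and HDBSCAN's own -1 labels can be carried along as a noise mask.

import numpy as np
import hdbscan

rng = np.random.default_rng(1)
embedding = np.vstack([rng.normal(loc=c, scale=0.3, size=(300, 2)) for c in (0.0, 5.0)])

clusterer = hdbscan.HDBSCAN(min_cluster_size=25, prediction_data=True).fit(embedding)
doc_topic_matrix = hdbscan.all_points_membership_vectors(clusterer)  # (n_docs, n_topics)

hard_topics = np.argmax(doc_topic_matrix, axis=1)  # forced hard assignment
noise_mask = clusterer.labels_ == -1               # observations HDBSCAN itself called noise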
Example No. 7
    def _cluster_embeddings(self,
                            umap_embeddings: np.ndarray,
                            documents: pd.DataFrame) -> Tuple[pd.DataFrame,
                                                              np.ndarray]:
        """ Cluster UMAP embeddings with HDBSCAN

        Arguments:
            umap_embeddings: The reduced sentence embeddings with UMAP
            documents: Dataframe with documents and their corresponding IDs

        Returns:
            documents: Updated dataframe with documents and their corresponding IDs
                       and newly added Topics
            probabilities: The distribution of probabilities
        """
        self.cluster_model = hdbscan.HDBSCAN(min_cluster_size=self.min_topic_size,
                                             metric='euclidean',
                                             cluster_selection_method='eom',
                                             prediction_data=True).fit(umap_embeddings)
        documents['Topic'] = self.cluster_model.labels_

        if self.calculate_probabilities:
            probabilities = hdbscan.all_points_membership_vectors(self.cluster_model)
        else:
            probabilities = None

        self._update_topic_size(documents)
        logger.info("Clustered UMAP embeddings with HDBSCAN")
        return documents, probabilities
Example No. 8
def hdbscan_with_knn(data, clf, thresh=None, mink_p=1.5, mink_kwargs=None):
    df = data.copy()
    mc = clf.min_cluster_size
    ms = clf.min_samples
    metric = clf.metric
    clf_method = clf.cluster_selection_method

    try:
        # run hdbscan
        if metric == 'wminkowski':
            mw = mink_weights(df, **mink_kwargs)
            metric = lambda x, y: wminkowski(x, y, p=mink_p, w=mw)

        clusterer = HDBSCAN(min_cluster_size=mc,
                            min_samples=ms,
                            prediction_data=True,
                            metric=metric,
                            cluster_selection_method=clf_method).fit(df)

        thresh = thresh if thresh else 1 / max(2, len(clusterer.exemplars_))

        # get exemplars and labels
        exemplars = np.concatenate([e for e in clusterer.exemplars_])
        labels = np.concatenate([
            np.full((len(e)), fill_value=i)
            for i, e in enumerate(clusterer.exemplars_)
        ])

        # fit knn on exemplars
        knn = KNeighborsClassifier(n_neighbors=1).fit(exemplars, labels)

        # map top soft cluster probabilities to obs
        probs = np.max(all_points_membership_vectors(clusterer), axis=1)
        df['top_prob'] = pd.Series(probs, index=df.index)

        # assign all points to outlier class (label:-1)
        df['label'] = -1

        # take all points above a prob threshold
        obs = df.top_prob >= thresh

        # predict labels from fitted knn
        df.loc[obs, 'label'] = knn.predict(
            df.loc[obs, df.columns.drop(['top_prob', 'label'])])
    except Exception:  # if clustering or the kNN mapping fails, fall back to a single label
        df['label'] = 0
        return df.label


#----------------------- TO-DO -----------------------------
# allow batch prediction (a hypothetical sketch follows this function)
# -- 1. assign points below thresh to outlier class
# -- 2. take top n% of obs by cluster prob and predict label
# -- 3. refit knn on assigned points
# -- 4. repeat steps 2 & 3 for remaining percentage bins

# allow for custom distance metrics and weight in hdbscan call
    return df.label
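A hypothetical sketch of the batch-prediction TO-DO above (the name batch_assign, the bin count, and the column layout are illustrative, not part of the original module): points below thresh stay in the outlier class, the most confident bin is labelled first with the exemplar-trained kNN, and the kNN is refit on everything labelled so far before moving to the next bin.

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier


def batch_assign(df, knn, thresh, n_bins=4):
    """Hypothetical batch labelling: assumes df carries the feature columns
    plus the 'top_prob' column computed in hdbscan_with_knn above."""
    df['label'] = -1                                   # 1. everyone starts as an outlier
    feature_cols = df.columns.drop(['top_prob', 'label'])
    eligible = df.index[df.top_prob >= thresh]         # only points above the probability threshold

    # 2./4. walk through bins from most to least confident
    order = df.loc[eligible, 'top_prob'].sort_values(ascending=False).index
    for chunk in np.array_split(np.asarray(order), n_bins):
        if len(chunk) == 0:
            continue
        df.loc[chunk, 'label'] = knn.predict(df.loc[chunk, feature_cols])
        # 3. refit the kNN on every point assigned so far
        labelled = df.index[df['label'] != -1]
        knn = KNeighborsClassifier(n_neighbors=1).fit(
            df.loc[labelled, feature_cols], df.loc[labelled, 'label'])
    return df['label']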
Example No. 9
def test_hdbscan_all_points_membership_vectors():
    clusterer = HDBSCAN(prediction_data=True).fit(X)
    vects = all_points_membership_vectors(clusterer)
    assert_array_almost_equal(
        vects[0], np.array([7.86400992e-002, 2.52734246e-001,
                            8.38299608e-002]))
    assert_array_almost_equal(
        vects[-1],
        np.array([8.09055344e-001, 8.35882503e-002, 1.07356406e-001]))
Example No. 10
	def make_clusters(self, min_size=11, metric='euclidean', use_soft_clustering=True):
		print("making clusters ..")
		self.use_soft_clustering = use_soft_clustering
		self.clusterer = hd.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, metric='euclidean',
			p=1, min_samples=1, cluster_selection_method='leaf', leaf_size=MIN_CLUSTER_SIZE*2,
			prediction_data=use_soft_clustering)
		result = self.clusterer.fit(self.features)
		if use_soft_clustering:
			self.soft_clusters = hd.all_points_membership_vectors(self.clusterer)
		print("finished making clusters ..")
Example No. 11
    def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
                min_cluster_size, allow_noise):
        print("Clustering ...")
        clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                            prediction_data=True,
                            metric="euclidean").fit(dim_reduced_vecs)
        print("Get prediction data ...")
        clusterer.generate_prediction_data()

        try:
            cluster_pred = clusterer.labels_ if allow_noise else np.argmax(
                all_points_membership_vectors(clusterer)[:, 1:], axis=1)
        except IndexError:
            print(
                "Got IndexError and will not enforce cluster membership (allow noise) ..."
            )
            print(all_points_membership_vectors(clusterer))
            cluster_pred = clusterer.labels_

        # scoring
        print("Get scores ...")

        # GLOSH
        threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
        outlier_pred = np.where(clusterer.outlier_scores_ > threshold, -1, 1)

        scores["cluster_n"] = len(np.unique(clusterer.labels_))
        scores["homogeneity"] = homogeneity_score(outlier_labels, cluster_pred)
        scores["completeness"] = completeness_score(outlier_labels,
                                                    cluster_pred)
        scores["v_measure"] = v_measure_score(outlier_labels, cluster_pred)

        scores = get_scores(scores, outlier_labels, outlier_pred)

        print(
            f"Homogeneity - {homogeneity_score(outlier_labels, cluster_pred)*100:.1f}  \
                cluster_n - {len(np.unique(clusterer.labels_))}")

        return scores, clusterer.outlier_scores_
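The GLOSH step above is compact enough to show in isolation (a sketch with synthetic data; the 0.9 quantile is the same heuristic the method uses, not a tuned value):

import numpy as np
import pandas as pd
from hdbscan import HDBSCAN

rng = np.random.default_rng(3)
X = np.vstack([rng.normal(0.0, 0.3, size=(500, 2)),    # one dense cluster
               rng.uniform(-4.0, 4.0, size=(25, 2))])  # scattered outliers

clusterer = HDBSCAN(min_cluster_size=25).fit(X)

# GLOSH outlier scores are available after fitting; flag the top 10%.
threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
outlier_pred = np.where(clusterer.outlier_scores_ > threshold, -1, 1)
print((outlier_pred == -1).sum(), "points flagged as outliers")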
Example No. 12
 def make_clusters(self):
     self.aggregates = []
     self.artists = []
     self.track_count = 0
     for _id in self.data.subdb:
         doc = self.data.subdb.get(_id)
         self.aggregates.append(doc["aggregates"]["mfcc"]["median"])
         self.artists.append({"name": doc["name"]})
         self.track_count += doc["track_count"]
     data = np.array(self.aggregates)
     self.clusterer = hd.HDBSCAN(min_cluster_size=3,
                                 metric='euclidean',
                                 p=1,
                                 min_samples=1,
                                 cluster_selection_method='leaf',
                                 leaf_size=5,
                                 prediction_data=True)
     result = self.clusterer.fit(data)
     self.soft_clusters = hd.all_points_membership_vectors(self.clusterer)
Example No. 13
def _set_cluster_member_colors(clusterer: HDBSCAN, soft: bool = True):
    n_clusters = np.size(np.unique(clusterer.labels_))

    if -1 in np.unique(clusterer.labels_) and not soft:
        color_palette = sns.color_palette('husl', n_clusters-1)
    else:
        color_palette = sns.color_palette('husl', n_clusters)

    if soft:
        soft_clusters = all_points_membership_vectors(clusterer)
        cluster_colors = [color_palette[np.argmax(x)]
                          for x in soft_clusters]
    else:
        cluster_colors = [color_palette[x] if x >= 0
                          else (0.5, 0.5, 0.5)
                          for x in clusterer.labels_]
    cluster_member_colors = [sns.desaturate(x, p)
                             for x, p
                             in zip(cluster_colors, clusterer.probabilities_)]
    return cluster_member_colors, color_palette
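A hedged usage sketch for the colour helper above (synthetic blobs; seaborn and all_points_membership_vectors are assumed to be imported in the helper's own module, as its body implies):

import numpy as np
import matplotlib.pyplot as plt
from hdbscan import HDBSCAN

rng = np.random.default_rng(7)
X = np.vstack([rng.normal(loc=c, scale=0.4, size=(300, 2)) for c in (0.0, 4.0, 8.0)])

clusterer = HDBSCAN(min_cluster_size=30, prediction_data=True).fit(X)
member_colors, palette = _set_cluster_member_colors(clusterer, soft=True)

# The per-point colours plug straight into a scatter plot.
plt.scatter(X[:, 0], X[:, 1], c=member_colors, s=10)
plt.show()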
Example No. 14
def generate_groups(utterances, embeddings, metric='euclidean'):

    keys = ['text', 'intent', 'confidence']
    common_examples = []
    clusterer = hdbscan.HDBSCAN(
        metric=metric,
        min_cluster_size=5,
        min_samples=2,
        prediction_data=True,
        cluster_selection_method='eom',
        alpha=0.8  # TODO: the docs say to keep the default of 1, but tuning it seems to help; may differ with real data.
    ).fit(np.inner(embeddings, embeddings))

    # create list like: [ [utterance, label] ] with strings
    labels_strings = list(map(str, clusterer.labels_))
    cluster_probs = hdbscan.all_points_membership_vectors(clusterer)
    values = zip(utterances, labels_strings, cluster_probs)
    for value in values:
        common_examples.append(dict(zip(keys, value)))

    message_groups = defaultdict(list)
    for example in common_examples:
        message_groups[example['intent']].append({
            "phrase":
            str(example['text']),
            # "confidence": list(example['confidence'])
        })

    unlabeled_messages = list(clusterer.labels_).count(-1)
    total_messages = len(utterances)
    return {
        "intents found": int(clusterer.labels_.max()),
        "unlabeled messages": int(unlabeled_messages),
        "labeled messaged": int(total_messages - unlabeled_messages),
        "total messages": int(total_messages),
        "message groups": message_groups
    }
Example No. 15
 def all_points_membership_vectors(self):
     return hdbscan.all_points_membership_vectors(self.hdbscan)
Example No. 16
def test_hdbscan_all_points_membership_vectors():
    clusterer = HDBSCAN(prediction_data=True, min_cluster_size=200).fit(X)
    vects = all_points_membership_vectors(clusterer)
    assert_array_equal(vects,
                       np.zeros(clusterer.prediction_data_.raw_data.shape[0]))
Example No. 17
    def cluster(self):
        """
        Cluster agents based on their traces.
        """
        if self.learner_params["Clustering"] == 'Grouped':
            if self.learner_params["Cluster_type"] == 'KMedoids':
                traces = None
                scheduling_profile = None

                if self.learner_params["Features"] == 'Normal':
                    # Clustering using the 11 standard features (reward, day of week, hour of the day, etc.)
                    traces, scheduling_profile = self.read_clustering_data()
                elif self.learner_params["Features"] == 'Advanced':
                    # Clustering using the derived features
                    traces, scheduling_profile = self.read_generated_clustering_data(
                    )

                dtw_days_matching_of_profiles = self.get_sorted_average_amount_activity_per_day_per_profile(
                    self.agent_profiles_params)
                distances = self.pre_calculate_distances(
                    traces,
                    scheduling_profile,
                    dtw_days_matching_of_profiles,
                    norm=False)

                K_Medoids = KMedoids()
                best_k = 0
                best_score = -1000000000
                best_clusters = None

                for k in range(2, min(6, self.number_agents - 1)):
                    clusters, curr_medoids = K_Medoids.cluster(
                        distances=distances, k=k)
                    silhouette_avg = silhouette_score(distances,
                                                      clusters,
                                                      metric="precomputed")
                    print(clusters)
                    print(
                        "__________________________________________________________________________"
                    )
                    print("For n_clusters =", k,
                          "The average silhouette_score is :", silhouette_avg)
                    print(
                        "__________________________________________________________________________"
                    )

                    if silhouette_avg > best_score:
                        best_score = silhouette_avg
                        best_clusters = clusters
                        best_k = k
                        print(
                            "__________________________________________________________________________"
                        )
                        print("Best K =", best_k,
                              "The best average silhouette_score is :",
                              best_score)
                        print(
                            "__________________________________________________________________________"
                        )

                print(best_clusters)
                self.number_clusters = best_k
                self.clusters = best_clusters
                self.assign_clusters_to_agents()

            elif self.learner_params[
                    "Cluster_type"] == 'AgglomerativeClustering':
                traces = None
                scheduling_profile = None
                if self.learner_params["Features"] == 'Normal':
                    # Clustering using the 11 standard features (reward, day of week, hour of the day, etc.)
                    traces, scheduling_profile = self.read_clustering_data()
                elif self.learner_params["Features"] == 'Advanced':
                    # Clustering using the derived features
                    traces, scheduling_profile = self.read_generated_clustering_data(
                    )
                dtw_days_matching_of_profiles = self.get_sorted_average_amount_activity_per_day_per_profile(
                    self.agent_profiles_params)
                # Clustering using hard clustering and precomputed distances
                distances = self.pre_calculate_distances(
                    traces,
                    scheduling_profile,
                    dtw_days_matching_of_profiles,
                    norm=False)
                best_k = 0
                best_score = -1000000000
                best_clusters = None

                for k in range(2, min(7, self.number_agents -
                                      1)):  # TODO: add parameters to config
                    clusters = AgglomerativeClustering(
                        k, affinity='precomputed',
                        linkage='complete').fit_predict(distances)
                    silhouette_avg = silhouette_score(distances,
                                                      clusters,
                                                      metric="precomputed")
                    print(clusters)
                    print(
                        "__________________________________________________________________________"
                    )
                    print("For n_clusters =", k,
                          "The average silhouette_score is :", silhouette_avg)
                    print(
                        "__________________________________________________________________________"
                    )

                    if silhouette_avg > best_score:
                        best_score = silhouette_avg
                        best_clusters = clusters
                        best_k = k
                        print(
                            "__________________________________________________________________________"
                        )
                        print("Best K =", best_k,
                              "The best average silhouette_score is :",
                              best_score)
                        print(
                            "__________________________________________________________________________"
                        )

                print(best_clusters)
                self.number_clusters = best_k
                self.clusters = best_clusters
                self.assign_clusters_to_agents_hdbscan()

            elif self.learner_params["Cluster_type"] == 'HDBScan':
                traces = None
                scheduling_profile = None

                if self.learner_params["Features"] == 'Normal':
                    # Clustering using the 11 standard features (reward, day of week, hour of the day, etc.)
                    traces, scheduling_profile = self.read_clustering_data()
                elif self.learner_params["Features"] == 'Advanced':
                    # Clustering using the derived features
                    traces, scheduling_profile = self.read_generated_clustering_data(
                    )

                dtw_days_matching_of_profiles = self.get_sorted_average_amount_activity_per_day_per_profile(
                    self.agent_profiles_params)

                #clustering using hard clustering and precomputed distances
                #                distances = self.pre_calculate_distances(traces,
                #                                                         scheduling_profile,
                #                                                         dtw_days_matching_of_profiles,
                #                                                         norm=False)
                #                cluster_labels = hdbscan.HDBSCAN(min_cluster_size=5, metric='precomputed').fit_predict(distances)
                #                most_common = collections.Counter(cluster_labels).most_common(1)[0][0]
                #                for i in range(0, len(cluster_labels)):
                #                    if cluster_labels[i] == -1:
                #                        cluster_labels[i] = most_common

                # clustering with soft clustering to deal with outliers (precomputed distances can't be used with this method)
                clusterer = hdbscan.HDBSCAN(min_cluster_size=5,
                                            prediction_data=True,
                                            metric='euclidean').fit(traces)
                soft_clusters = hdbscan.all_points_membership_vectors(
                    clusterer)
                cluster_labels = [np.argmax(x) for x in soft_clusters]
                print("CLUSTER LABELS" + str(cluster_labels))
                self.number_clusters = len(set(cluster_labels))
                self.clusters = cluster_labels
                self.assign_clusters_to_agents_hdbscan()
Example No. 18
    numulab = []
    min_cluster_range = np.linspace(cluster_range[0], cluster_range[1], 25)
    for min_c in min_cluster_range:
        trained_classifier = hdbscan.HDBSCAN(
            prediction_data=True,
            min_cluster_size=int(round(min_c * 0.01 *
                                       umap_embeddings.shape[0])),
            **HDBSCAN_PARAMS).fit(umap_embeddings)
        numulab.append(len(np.unique(trained_classifier.labels_)))
        if numulab[-1] > highest_numulab:
            st.info(
                'Adjusting minimum cluster size to maximize cluster number...')
            highest_numulab = numulab[-1]
            best_clf = trained_classifier
    assignments = best_clf.labels_
    soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
    soft_assignments = np.argmax(soft_clusters, axis=1)
    st.info(
        'Done assigning labels for **{}** instances in **{}** D space'.format(
            *umap_embeddings.shape))
    with open(
            os.path.join(OUTPUT_PATH,
                         str.join('', (MODEL_NAME, '_clusters.sav'))),
            'wb') as f:
        joblib.dump([assignments, soft_clusters, soft_assignments], f)
    st.balloons()

if last_run:
    with open(
            os.path.join(OUTPUT_PATH,
                         str.join('', (MODEL_NAME, '_clusters.sav'))),
Example No. 19
               random_state=42).fit_transform(tfidf_vecs)
print("Local outlier factor ...")
# df["predicted"] = LocalOutlierFactor(
#    novelty=False, metric="euclidean", contamination=d["contamination"]).fit_predict(tfidf_vecs)
clusterer = HDBSCAN(min_cluster_size=10,
                    prediction_data=True).fit(dim_reduced_vecs)
threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
df["predicted"] = np.where(clusterer.outlier_scores_ > threshold, -1, 1)

df["result"] = df.apply(lambda row: get_result(row), axis=1)

title = df["result"].value_counts().to_string().replace("\n", "\t")
title = f"m_clus: {clusterer.min_cluster_size} n_comp: {n_comps}" + title
print(classification_report(df["outlier_label"], df["predicted"]))
outlier_labels = df["outlier_label"]
print(all_points_membership_vectors(clusterer))
cluster_labels = clusterer.labels_ if allow_noise else np.argmax(
    all_points_membership_vectors(clusterer)[:, 1:], axis=1)
print(f"\nHomogeneity: {homogeneity_score(outlier_labels, cluster_labels)}")
crosstab = pd.crosstab(cluster_labels, outlier_labels, normalize='index')
print(f"\n\n {crosstab}")
crosstab_abs = pd.crosstab(cluster_labels, outlier_labels)
print(f"\n\n {crosstab_abs}")

if showclusters:
    df["result"] = cluster_labels.astype(str)
fig = create_show_graph(df, "text", coords_2d=vecs_2d, color="result")
fig.update_layout(title=title)
fig.show()

# !! get imdb % of each cluster and homogeneity score
Example No. 20
def run_streamlit_app():
    # Introduction
    st.title('B-SOiD')
    st.header('An open-source machine learning app for parsing spatio-temporal patterns.')
    st.subheader('Extract behavior from pose for any organism, any camera angle! '
                 'Note that keeping the checkboxes unchecked when not needed speeds up the processing.')

    demo_videos = {
        "Open-field, unrestrained, wild-type (Yttri lab @ CMU)":
            f"{os.path.join(BSOID_BASE_PROJECT_PATH, 'demo', 'ClusteredBehavior_aligned.mp4')}",
        "Open-field, tethered, OCD model (Ahmari lab @ UPitt)":
            f"{os.path.join(BSOID_BASE_PROJECT_PATH, 'demo', 'bsoid_grm_demo.mp4')}",
    }
    vid = st.selectbox("Notable examples, please contribute!", list(demo_videos.keys()), 0)
    with open(demo_videos[vid], 'rb') as video_file:
        # video_file = open(demo_vids[vid], 'rb')
        video_bytes = video_file.read()
        st.video(video_bytes)

    # Load previous run?
    if st.sidebar.checkbox("Load previous run? This resumes training, or can "
                           "load previously trained network for new analysis.", False):
        OUTPUT_PATH = st.sidebar.text_input('Enter the prior run output directory:')
        try:
            os.listdir(OUTPUT_PATH)
            st.markdown(f'You have selected **{OUTPUT_PATH}** as your prior run root directory.')
        except FileNotFoundError:
            st.error('No such directory')

        MODEL_NAME = st.sidebar.text_input('Enter your prior run variable file prefix:')
        if MODEL_NAME:
            st.markdown(f'You have selected **{MODEL_NAME}_[contents].sav** as your prior variable files.')

            app_model_data_filename = f'{MODEL_NAME}_data.sav'
            app_model_features_filename = f'{MODEL_NAME}_features.sav'
            app_model_predictions_filename = f'{MODEL_NAME}_predictions.sav'
            app_model_clusters_filename = f'{MODEL_NAME}_clusters.sav'
            app_model_neuralnet_filename = f'{MODEL_NAME}_neuralnet.sav'
        else:
            st.error('Please enter a prefix name for prior run variable file.')

        last_run = True
    else:
        last_run = False


    if not last_run:
        # # Setting things up # #
        # BASE_PATH, TRAIN_FOLDERS, FPS, OUTPUT_PATH and MODEL_NAME designations
        st.subheader('Find your data')
        st.write('The __BASE PATH__ contains multiple nested directories.')
        BASE_PATH = st.text_input('Enter a DLC project "BASE PATH":', DLC_PROJECT_PATH)
        try:
            os.listdir(BASE_PATH)
            st.markdown(
                f'You have selected **{BASE_PATH}** as your root directory for training/testing sub-directories.')
        except FileNotFoundError:
            st.error('No such directory')
        st.write('The __sub-directory(ies)__ each contain one or more .csv files. '
                 'Currently supporting _2D_ and _single_ animal.')
        TRAIN_FOLDERS = []
        num_project_path_sub_directories = int(st.number_input('How many BASE_PATH/SUB-DIRECTORIES for training?', value=3))
        st.markdown(f'You will be training on **{num_project_path_sub_directories}** csv-containing sub-directories.')
        for i in range(num_project_path_sub_directories):
            training_dir = st.text_input(f'Enter path to training directory NUMBER {i+1} within base path:')
            try:
                os.listdir(f'{BASE_PATH}{training_dir}')
            except FileNotFoundError:
                st.error('No such directory')
            if training_dir not in TRAIN_FOLDERS:
                TRAIN_FOLDERS.append(training_dir)
        st.markdown(f'You have selected **sub-directory(ies)** *{TRAIN_FOLDERS}*.')
        st.write('Average __frame-rate__ for these processed .csv files. '
                 'Your pose estimation will be integrated over 100ms. '
                 'For most animal behaviors, static poses per 100ms appears to capture _sufficient information_ '
                 'for behavioral clustering while maintaining _high temporal resolution._')
        FPS = int(st.number_input('What is your frame-rate?', value=60))
        st.markdown(f'Your framerate is **{FPS}** frames per second.')

        st.write('The __output directory__ will store B-SOID clustering _variable_ files and .csv _analyses_.')
        OUTPUT_PATH = st.text_input('Enter an output directory:', value=config.OUTPUT_PATH)

        try:
            os.listdir(OUTPUT_PATH)
            st.markdown(f'You have selected **{OUTPUT_PATH}** to store results.')
        except FileNotFoundError:
            st.error('No such directory, was there a typo or did you forget to create one?')

        st.write('For each run, computed variables are stored as __.sav files__. '
                 'If you type in the same variable prefix as last run, your _workspace_ will be loaded.')
        MODEL_NAME = st.text_input('Enter a variable file name prefix:')
        if MODEL_NAME:
            st.markdown(f'You have named **{MODEL_NAME}_XXX.sav** as the variable files.')
        else:
            st.error('Please enter a name for your variable file name prefix.')

        # Pre-processing
        st.subheader('__Pre-process__ the low-likelihood estimations as a representation of occlusion coordinates.')
        st.text_area('', '''
        Within each .csv file, the algorithm finds the best likelihood cutoff for each body part.
        ''')
        csv_rep = glob.glob(BASE_PATH + TRAIN_FOLDERS[0] + '/*.csv')
        # curr_df = pd.read_csv(csv_rep[0], low_memory=False)
        try:
            curr_df = pd.read_csv(csv_rep[0], low_memory=False)
        except IndexError:
            st.error('No CSV files were found.')

        currdf = np.array(curr_df)
        BP = st.multiselect('Body parts to include', [*currdf[0, 1:-1:3]], [*currdf[0, 1:-1:3]])
        BODYPARTS = []
        for b in BP:
            index = [i for i, s in enumerate(currdf[0, 1:]) if b in s]
            if not index in BODYPARTS:
                BODYPARTS += index
        BODYPARTS.sort()
        if st.button("Start pre-processing"):
            filenames_list, rawdata_list, data_list, perc_rect_list = [], [], [], []
            for idx_folder, folder in enumerate(TRAIN_FOLDERS):  # Loop through folders
                f = io.get_filenames_csvs_from_folders_recursively_in_dlc_project_path(folder)
                my_bar = st.progress(0)
                for j, filename in enumerate(f):
                    curr_df = pd.read_csv(filename, low_memory=False)
                    curr_df_filt, perc_rect = feature_engineering.adaptive_filter_LEGACY(curr_df)
                    rawdata_list.append(curr_df)
                    perc_rect_list.append(perc_rect)
                    data_list.append(curr_df_filt)
                    filenames_list.append(filename)
                    my_bar.progress(round((j + 1) / len(f) * 100))
            training_data = np.array(data_list)
            with open(os.path.join(OUTPUT_PATH, app_model_data_filename), 'wb') as f:  # with open(os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_data.sav'))), 'wb') as f:      f'{MODEL_NAME}_data.sav'
                joblib.dump([BASE_PATH, FPS, BODYPARTS, filenames_list, rawdata_list, training_data, perc_rect_list], f)

            st.info(f'Processed a total of **{len(data_list)}** CSV files, '
                    f'and compiled into a **{training_data.shape}** data list.')
            st.balloons()
        #
        with open(os.path.join(OUTPUT_PATH, app_model_data_filename), 'rb') as fr:  # f'{MODEL_NAME}_data.sav'
            BASE_PATH, FPS, BODYPARTS, filenames, rawdata_list, training_data, perc_rect_list = joblib.load(fr)
        if st.checkbox('Show % body part processed per file?', False):
            st.write('This line chart shows __% body part below file-based threshold__')
            subllh_percent = pd.DataFrame(perc_rect_list)
            st.bar_chart(subllh_percent)
        # st.write('This allows you to scroll through and visualize raw vs processed data.')
        # if st.checkbox("Show raw & processed data?", False):
        #     try:
        #         ID = int(st.number_input('Enter csv/data-list index:', min_value=1, max_value=len(rawdata_li), value=1))
        #         st.markdown('This is file *{}*.'.format(filenames[ID - 1]))
        #         st.write(rawdata_li[ID - 1])
        #         st.write(training_data[ID - 1])
        #     except:
        #         pass

    if last_run:
        with open(os.path.join(config.OUTPUT_PATH, app_model_data_filename), 'rb') as fr:
            BASE_PATH, FPS, BODYPARTS, filenames, rawdata_list, training_data, perc_rect_list = joblib.load(fr)
        if st.checkbox('Show % body part processed per file?', False):
            st.write('This line chart shows __% body part below file-based threshold__')
            subllh_percent = pd.DataFrame(perc_rect_list)
            st.bar_chart(subllh_percent)
        st.markdown(f'**_CHECK POINT_**: Processed a total of **{len(rawdata_list)}** CSV files, '
                    f'and compiled into a **{training_data.shape}** data list.')
        st.write('This allows you to scroll through and visualize raw vs processed data.')
        if st.checkbox("Show raw & processed data?", False):
            try:
                ID = int(
                    st.number_input('Enter csv/data-list index:', min_value=1, max_value=len(rawdata_list), value=1))
                st.write(rawdata_list[ID - 1])
                st.write(training_data[ID - 1])
            except Exception as e:  # TODO: med: exception is too generalized. Add note or make more specific.
                st.error(f'Error found: {repr(e)}.')
                pass



    # Feature extraction + UMAP
    st.subheader('Perform __dimensionality reduction__ to improve clustering.')
    st.text_area('', '''
    For each body part, find the distance to all others, the angular change between these distances, and its displacement over time. 
    That is A LOT of dimensions, so reducing it is necessary.
    ''')
    if st.button("Start dimensionality reduction"):

        # TODO ********************** THIS IS A TOTAL REPEAT OF ANOTHER FEATURE EXTRACTION FUNCTION ************************
        win_len = int(np.round(0.05 / (1 / FPS)) * 2 - 1)
        feats = []
        my_bar = st.progress(0)
        for m in range(len(training_data)):
            data_range = len(training_data[m])
            dis_r, dxy_r = [], []
            for r in range(data_range):
                if r < data_range - 1:
                    dis = []
                    for c in range(0, training_data[m].shape[1], 2):
                        dis.append(np.linalg.norm(training_data[m][r + 1, c:c + 2] - training_data[m][r, c:c + 2]))
                    dis_r.append(dis)
                dxy = []
                for i, j in itertools.combinations(range(0, training_data[m].shape[1], 2), 2):
                    dxy.append(training_data[m][r, i:i + 2] - training_data[m][r, j:j + 2])
                dxy_r.append(dxy)
            dis_r = np.array(dis_r)
            dxy_r = np.array(dxy_r)
            dis_smth = []
            dxy_eu = np.zeros([data_range, dxy_r.shape[1]])
            ang = np.zeros([data_range - 1, dxy_r.shape[1]])
            dxy_smth = []
            ang_smth = []
            for l in range(dis_r.shape[1]):
                dis_smth.append(likelihoodprocessing.boxcar_center(dis_r[:, l], win_len))
            for k in range(dxy_r.shape[1]):
                for kk in range(data_range):
                    dxy_eu[kk, k] = np.linalg.norm(dxy_r[kk, k, :])
                    if kk < data_range - 1:
                        b_3d = np.hstack([dxy_r[kk + 1, k, :], 0])
                        a_3d = np.hstack([dxy_r[kk, k, :], 0])
                        c = np.cross(b_3d, a_3d)
                        ang[kk, k] = np.dot(np.dot(np.sign(c[2]), 180) / np.pi,
                                            math.atan2(np.linalg.norm(c),
                                                       np.dot(dxy_r[kk, k, :], dxy_r[kk + 1, k, :])))
                dxy_smth.append(likelihoodprocessing.boxcar_center(dxy_eu[:, k], win_len))
                ang_smth.append(likelihoodprocessing.boxcar_center(ang[:, k], win_len))
            dis_smth = np.array(dis_smth)
            dxy_smth = np.array(dxy_smth)
            ang_smth = np.array(ang_smth)
            feats.append(np.vstack((dxy_smth[:, 1:], ang_smth, dis_smth)))
            my_bar.progress(round((m + 1) / len(training_data) * 100))
        st.info(f'Done extracting features from a total of **{len(training_data)}** training '
                f'CSV files. Now reducing dimensions...')
        for n in range(len(feats)):
            feats1 = np.zeros(len(training_data[n]))
            for k in range(round(FPS / 10), len(feats[n][0]), round(FPS / 10)):
                if k > round(FPS / 10):
                    feats1 = np.concatenate((feats1.reshape(feats1.shape[0], feats1.shape[1]),
                                             np.hstack((np.mean((feats[n][0:dxy_smth.shape[0],
                                                                 range(k - round(FPS / 10), k)]), axis=1),
                                                        np.sum((feats[n][dxy_smth.shape[0]:feats[n].shape[0],
                                                                range(k - round(FPS / 10), k)]),
                                                               axis=1))).reshape(len(feats[0]), 1)), axis=1)
                else:
                    feats1 = np.hstack((np.mean((feats[n][0:dxy_smth.shape[0], range(k - round(FPS / 10), k)]), axis=1),
                                        np.sum((feats[n][dxy_smth.shape[0]:feats[n].shape[0],
                                                range(k - round(FPS / 10), k)]), axis=1))).reshape(len(feats[0]), 1)
            if n > 0:
                features_10fps = np.concatenate((features_10fps, feats1), axis=1)
                scaler = StandardScaler()
                scaler.fit(feats1.T)
                feats1_scaled = scaler.transform(feats1.T).T
                features_10fps_scaled = np.concatenate((features_10fps_scaled, feats1_scaled), axis=1)
            else:
                features_10fps = feats1
                scaler = StandardScaler()
                scaler.fit(feats1.T)
                feats1_scaled = scaler.transform(feats1.T).T
                features_10fps_scaled = feats1_scaled  # scaling is important as I've seen wildly different stdev/feat between sessions
        features_10fps_train = features_10fps_scaled.T
        mem = virtual_memory()
        if mem.available > features_10fps_scaled.shape[0] * features_10fps_scaled.shape[1] * 32 * 100 + 256_000_000:  # TODO: low: magic variables
            trained_umap = umap.UMAP(**UMAP_PARAMS).fit(features_10fps_train)  # n_neighbors removed, moved to config
        else:
            st.info('Detecting that you are running low on available memory for this '
                    'computation, setting low_memory so will take longer.')
            trained_umap = umap.UMAP(low_memory=True, **UMAP_PARAMS).fit(features_10fps_train)
        umap_embeddings = trained_umap.embedding_
        st.info(f'Done non-linear transformation of **{features_10fps_train.shape[0]}** instances '
                f'from **{features_10fps_train.shape[1]}** D into **{umap_embeddings.shape[1]}** D.')
        with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'wb') as file:
            joblib.dump([features_10fps, features_10fps_scaled, umap_embeddings], file)
        st.balloons()

    if last_run:
        with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr:
            features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr)
        st.markdown(f'**_CHECK POINT_**: Done non-linear transformation of **{features_10fps_scaled.shape[1]}** instances '
                    f'from **{features_10fps_scaled.shape[0]}** D into **{umap_embeddings.shape[1]}** D.')

    # HDBSCAN
    st.subheader('Perform density-based clustering.')
    st.text_area('', '''
    The following slider allows you to adjust cluster number.
    The preset (0.5-1.5%) works for most large (> 25k instances) datasets. 
    It is recommended to tweak this for cluster number > 40 or < 4.
    ''')
    cluster_range = st.slider('Select range of minimum cluster size in %', 0.01, 5.0, (0.4, 1.2))
    st.markdown(f'Your minimum cluster size ranges between **{cluster_range[0]}%** and **{cluster_range[1]}%**.')
    if st.button("Start clustering"):
        with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr:
            features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr)
        highest_numulab = -np.inf
        numulab = []
        min_cluster_range = np.linspace(cluster_range[0], cluster_range[1], 25)
        for min_c in min_cluster_range:
            trained_classifier = hdbscan.HDBSCAN(prediction_data=True,
                                                 min_cluster_size=int(round(min_c * 0.01 * umap_embeddings.shape[0])),
                                                 **HDBSCAN_PARAMS).fit(umap_embeddings)
            numulab.append(len(np.unique(trained_classifier.labels_)))
            if numulab[-1] > highest_numulab:
                st.info('Adjusting minimum cluster size to maximize cluster number...')
                highest_numulab = numulab[-1]
                best_clf = trained_classifier
        assignments = best_clf.labels_  # TODO: med: potential for reference before assignment
        soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
        soft_assignments = np.argmax(soft_clusters, axis=1)
        st.info('Done assigning labels for **{}** instances in **{}** D space'.format(*umap_embeddings.shape))
        with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'wb') as f:
            joblib.dump([assignments, soft_clusters, soft_assignments], f)
        st.balloons()

    if last_run:
        with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr:
            assignments, soft_clusters, soft_assignments = joblib.load(fr)
        st.markdown('**_CHECK POINT_**: Done assigning labels for '
                    '**{}** instances in **{}** D space'.format(*umap_embeddings.shape))

    if st.checkbox("Show UMAP enhanced clustering plot?", True):
        st.write('Below are two cluster plots.')
        with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr:  # str.join('', (MODEL_NAME, '_feats.sav'))
            features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr)
        with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr:  # str.join('', (MODEL_NAME, '_clusters.sav'))
            assignments, soft_clusters, soft_assignments = joblib.load(fr)
        fig1, plt1 = visuals.plot_classes_bsoidapp(umap_embeddings[assignments >= 0], assignments[assignments >= 0])
        plt1.suptitle('HDBSCAN assignment')
        st.pyplot(fig1)
        st.write('The __soft__ assignment disregards noise and attempts to fit all data points to assignments '
                 'based on highest probability.')
        fig2, plt2 = visuals.plot_classes_bsoidapp(umap_embeddings[soft_assignments >= 0],
                                                   soft_assignments[soft_assignments >= 0])
        plt2.suptitle('HDBSCAN soft assignment')
        st.pyplot(fig2)

    st.subheader('Based on __soft__ assignment, train a neural network to _learn_ the rules.')
    st.text_area('', '''
    Neural network will be trained on recognizing distance, angles, and speed. 
    This is for our vision in closed-loop experiments
                 ''')
    if st.button("Start training a behavioral neural network"):
        with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr:  # with open(os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_feats.sav'))), 'rb') as fr:  f'{MODEL_NAME}_feats.sav'
            features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr)
        with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr:  # with open(os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_clusters.sav'))), 'rb') as fr:  f'{MODEL_NAME}_clusters.sav'
            assignments, soft_clusters, soft_assignments = joblib.load(fr)

        features_10fps_train, feats_test, labels_train, labels_test = train_test_split(
            features_10fps.T, soft_assignments.T, test_size=HOLDOUT_PERCENT, random_state=config.RANDOM_STATE)
        st.info(
            f'Training feedforward neural network on randomly partitioned {(1 - HOLDOUT_PERCENT) * 100}% of training data...')
        classifier = MLPClassifier(**MLP_PARAMS)
        classifier.fit(features_10fps_train, labels_train)
        clf = MLPClassifier(**MLP_PARAMS)
        clf.fit(features_10fps.T, soft_assignments.T)
        nn_assignments = clf.predict(features_10fps.T)
        st.info(f'Done training feedforward neural network '
                f'mapping **{features_10fps.T.shape}** features to **{soft_assignments.T.shape}** assignments.')
        scores = cross_val_score(classifier, feats_test, labels_test, cv=CROSSVALIDATION_K, n_jobs=-1)
        with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'wb') as f:  # str.join('', (MODEL_NAME, '_neuralnet.sav'))
            joblib.dump([feats_test, labels_test, classifier, clf, scores, nn_assignments], f)
        st.balloons()

    if last_run:  # app_model_neuralnet_filename
        with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr:  # str.join('', (MODEL_NAME, '_neuralnet.sav'))
            feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr)
        st.markdown(f'**_CHECK POINT_**: Done training feedforward neural network '
                    f'mapping **{features_10fps.T.shape}** features to **{soft_assignments.T.shape}** assignments.')

    if st.checkbox(f"Show confusion matrix on {HOLDOUT_PERCENT * 100}% data?", False):
        with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr:  # f'{MODEL_NAME}_neuralnet.sav'
            feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr)
        np.set_printoptions(precision=2)  # TODO: low: move precision setting to top?
        titles_options = [("Non-normalized confusion matrix", None),
                          ("Normalized confusion matrix", 'true'), ]
        titlenames = ["counts", "norm"]  # TODO: unused variable
        j = 0  # TODO: unused variable
        st.write('Below are two confusion matrices - top: raw counts, bottom: probability. These matrices show '
                 '**true positives on the diagonal**, false negatives in rows, and false positives in columns')
        for title, normalize in titles_options:
            colormap = plot_confusion_matrix(classifier, feats_test, labels_test, cmap=plt.cm.Blues, normalize=normalize)
            colormap.ax_.set_title(title)
            j += 1
            st.pyplot(colormap.figure_)
        st.write(
            'If these are **NOT satisfactory**, either _increase_ the above minimum cluster size to '
            'remove noise subgroups, or include _more data_')
    if st.checkbox(
            "Show cross-validated accuracy on randomly selected {}% held-out test set?".format(HOLDOUT_PERCENT * 100),
            False):
        st.write(
            'For **overall** machine learning accuracy, a part of the error could be _cleaning up_ clustering noise.')
        with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr:
            feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr)
        fig, plt_acc = visuals.plot_accuracy_bsoidapp(scores)
        st.pyplot(fig)
        st.write(
            'If this is **NOT satisfactory**, either _increase_ the above minimum cluster size to '
            'remove noise subgroups, or include _more data_')

    st.subheader(f'If reasonable/satisfied, you may export analyses results to {OUTPUT_PATH}')
    txt5 = st.text_area('Result options descriptions:', '''
    Input features: basic statistics of these extracted pairwise distance, angle, and speed features. 
    Feature corresponding labels: these features time-locked to the labels. 
    Soft assignment probabilities: if interested, the label probabilities of each time point.
    ''')
    result1_options = st.multiselect('What type of results do you want to export',
                                     ['Input features', 'Feature corresponding labels',
                                      'Soft assignment probabilities', ],
                                     ['Feature corresponding labels', ], )
    if st.button('Export'):
        if any('Input features' in o for o in result1_options):
            with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr:  # str.join('', (MODEL_NAME, '_feats.sav'))
                features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr)
            timestr = time.strftime("_%Y%m%d_%H%M")
            feat_range, feat_med, p_cts, edges = statistics.feat_dist(features_10fps)
            f_range_df = pd.DataFrame(feat_range, columns=['5%tile', '95%tile'])
            f_med_df = pd.DataFrame(feat_med, columns=['median'])
            f_pcts_df = pd.DataFrame(p_cts)
            f_pcts_df.columns = pd.MultiIndex.from_product([f_pcts_df.columns, ['prob']])
            f_edge_df = pd.DataFrame(edges)
            f_edge_df.columns = pd.MultiIndex.from_product([f_edge_df.columns, ['edge']])
            f_dist_data = pd.concat((f_range_df, f_med_df, f_pcts_df, f_edge_df), axis=1)
            f_dist_data.to_csv((os.path.join(OUTPUT_PATH, f'feature_distribution_10Hz{timestr}.csv')),  # str.join('', ('', timestr, '.csv'))
                index=True, chunksize=10000, encoding='utf-8')
        if any('Feature corresponding labels' in o for o in result1_options):
            with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr:
                features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr)
            with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr:
                assignments, soft_clusters, soft_assignments = joblib.load(fr)
            with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr:
                feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr)
            timestr = time.strftime("_%Y%m%d_%H%M")
            length_nm, angle_nm, disp_nm = [], [], []
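            # Rebuild human-readable feature names: with P tracked points there are P*(P-1)/2 pairwise distances,
            # P*(P-1)/2 angular changes, and P displacements (P**2 features in total), so P is recovered here as
            # sqrt(features_10fps.shape[0]).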
            for i, j in itertools.combinations(range(int(np.sqrt(features_10fps.shape[0]))), 2):
                length_nm.append(['distance between points:', i + 1, j + 1])
                angle_nm.append(['angular change for points:', i + 1, j + 1])
            for i in range(int(np.sqrt(features_10fps.shape[0]))):
                disp_nm.append(['displacement for point:', i + 1, i + 1])
            m_columns = np.vstack((length_nm, angle_nm, disp_nm))
            feat_nm_df = pd.DataFrame(features_10fps.T, columns=m_columns)
            umaphdb_data = np.concatenate([umap_embeddings, assignments.reshape(len(assignments), 1),
                                           soft_assignments.reshape(len(soft_assignments), 1),
                                           nn_assignments.reshape(len(nn_assignments), 1), ], axis=1)
            multi_index_columns = pd.MultiIndex.from_tuples([
                ('UMAP embeddings', 'Dimension 1'),
                ('',                'Dimension 2'),
                ('',                'Dimension 3'),
                ('HDBSCAN',         'Assignment No.'),
                ('HDBSCAN*SOFT',    'Assignment No.'),
                ('Neural Net',      'Assignment No.')],
                names=['Type', 'Frame@10Hz'], )
            umaphdb_df = pd.DataFrame(umaphdb_data, columns=multi_index_columns)
            training_data = pd.concat((feat_nm_df, umaphdb_df), axis=1)
            soft_clust_prob = pd.DataFrame(soft_clusters)  # NOTE: unused in this branch; soft-cluster probabilities are exported separately below
            training_data.to_csv((os.path.join(OUTPUT_PATH, f'features_labels_10Hz{timestr}.csv')),  # str.join('', ('features_labels_10Hz', timestr, '.csv'))
                                 index=True, chunksize=10000, encoding='utf-8')
        if any('Soft assignment probabilities' in o for o in result1_options):
            with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr:
                assignments, soft_clusters, soft_assignments = joblib.load(fr)
            timestr = time.strftime("_%Y%m%d_%H%M")
            soft_clust_prob = pd.DataFrame(soft_clusters)
            soft_clust_prob.to_csv((os.path.join(OUTPUT_PATH, f'soft_cluster_prob_10Hz{timestr}.csv')),
                                   index=True, chunksize=10000, encoding='utf-8')
        st.balloons()

    if st.sidebar.checkbox('Behavioral structure visual analysis?', False):
        with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr:
            assignments, soft_clusters, soft_assignments = joblib.load(fr)
        with open(os.path.join(OUTPUT_PATH, app_model_predictions_filename), 'rb') as fr:  # str.join('', (MODEL_NAME, '_predictions.sav'))),
            folders, folders_list, filenames, data_new, frameshift_labels = joblib.load(fr)
        selected_folder = st.sidebar.selectbox('select folder', [*folders])
        try:
            indices = [i for i, s in enumerate(folders_list) if str(selected_folder) in s]
            tm_c_all, tm_p_all = [], []
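            # For every file in the selected folder, statistics.main_app returns behavior run lengths, bout-duration
            # statistics, and the behavioral transition matrix in raw (B) and normalized (B_norm) form. The per-file
            # matrices are averaged below; node size scales with each behavior's relative self-transition weight
            # (the averaged diagonal), and edge color with the renormalized off-diagonal transition probabilities.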
            for idx in indices:
                df_runlengths, df_dur_statistics, B, df_tm, B_norm = statistics.main_app(
                    frameshift_labels[idx], len(np.unique(soft_assignments)))
                tm_c_all.append(B)
                tm_p_all.append(B_norm)
            tm_c_ave = np.nanmean(tm_c_all, axis=0)
            tm_p_ave = np.nanmean(tm_p_all, axis=0)
            diag = [tm_c_ave[i][i] for i in range(len(tm_c_ave))]
            diag_p = np.array(diag) / np.array(diag).max()
            node_sizes = [50 * i for i in diag_p]
            A = np.array(tm_p_ave)  # use a regular ndarray; the np.matrix subclass is discouraged by numpy
            np.fill_diagonal(A, 0)
            A_norm = A / A.sum(axis=1, keepdims=True)  # row-normalize the remaining (off-diagonal) transition probabilities
            A_norm[np.isnan(A_norm)] = 0  # rows with no outgoing transitions divide by zero; zero them out
            fig = plt.figure()
            G = nx.from_numpy_matrix(A_norm, create_using=nx.MultiDiGraph())
            pos = nx.layout.spring_layout(G)
            edge_colors = [G[u][v][0].get('weight') for u, v in G.edges()]
            nodes = nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='blue')  # node labels are drawn separately below
            edges = nx.draw_networkx_edges(G, pos, node_size=node_sizes, arrowstyle='->',
                                           arrowsize=8, edge_color=edge_colors, edge_cmap=plt.cm.Blues, width=1.5)
            lab_pos = [pos[i] + 0.005 for i in range(len(pos))]
            nx.draw_networkx_labels(G, lab_pos, font_size=10)
            pc = mpl.collections.PatchCollection(edges, cmap=plt.cm.Blues)
            pc.set_array(edge_colors)
            plt.colorbar(pc)
            ax = plt.gca()
            ax.set_axis_off()
            st.pyplot(fig)
        except Exception as e:
            logger.error(f'Could not generate behavioral structure plot: {repr(e)}')

    else:
        st.subheader('Make sense of these behaviors and bulk-process old/new data.')
        txt = st.text_area('Process flow options:', '''
        Generate predictions and corresponding videos: step through videos one at a time and inspect the predictions with visuals.
        Bulk process all csvs: once you have settled on definitions for the labels, run predictions consistently across files; you will be prompted for the types of analysis to export.
        ''')

        prediction_options = st.selectbox('Select an option:',
                                          ('Generate predictions and corresponding videos', 'Bulk process all csvs'))
        if prediction_options == 'Generate predictions and corresponding videos':
            csv_dir = st.text_input('Enter the testing data sub-directory within BASE PATH:')
            try:
                os.listdir(os.path.join(DLC_PROJECT_PATH, csv_dir))  # os.listdir(str.join('', (BASE_PATH, csv_dir)))
                st.markdown(f'You have selected **{csv_dir}** as your csv data sub-directory.')
            except FileNotFoundError:
                st.error('No such directory')
            csv_file = st.selectbox('Select the csv file', sorted(os.listdir(os.path.join(DLC_PROJECT_PATH, csv_dir))))
            vid_dir = st.text_input('Enter corresponding video directory (This can be outside of BASE PATH):')
            try:
                os.listdir(vid_dir)
                st.markdown(f'You have selected **{vid_dir}** as your video directory.')
            except FileNotFoundError:
                st.error('No such directory')
            vid_file = st.selectbox('Select the video (.mp4 or .avi)', sorted(os.listdir(vid_dir)))
            st.markdown(f'You have selected **{vid_file}** as your video matching **{csv_file}**.')
            csv_filename = os.path.basename(csv_file).rpartition('.')[0]
            try:
                os.mkdir(os.path.join(DLC_PROJECT_PATH, csv_dir, 'pngs'))
            except FileExistsError:
                pass
            try:
                path_to_make = os.path.join(DLC_PROJECT_PATH, csv_dir, 'pngs', csv_filename)  # path_to_make = f'{DLC_PROJECT_PATH}{csv_dir}{os.path.sep}pngs{os.path.sep}{csv_filename}'
                if not os.path.isdir(path_to_make):
                    os.mkdir(path_to_make)  # os.mkdir(str.join('', (DLC_PROJECT_PATH, csv_dir, '/pngs', '/', csv_filename)))

            except FileExistsError as fee:
                err = f'Error: {repr(fee)}'
                logger.error(err)
                pass
            frame_dir = os.path.join(DLC_PROJECT_PATH, csv_dir, 'pngs', csv_filename)  # frame_dir = f'{DLC_PROJECT_PATH}{csv_dir}{os.path.sep}pngs{os.path.sep}{csv_filename}'  # TODO: low: refactor `str.join(...)`
            st.markdown(f'You have created **{frame_dir}** as your PNG directory for video file {vid_file}.')
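            # Probe the video with ffprobe for its geometry, frame count, bit rate, and average frame rate, then
            # (on button press) extract every frame as a half-resolution PNG into frame_dir so the frames can later
            # be assembled into short example clips per predicted label.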
            probe = ffmpeg.probe(os.path.join(vid_dir, vid_file))
            video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
            width = int(video_info['width'])
            height = int(video_info['height'])
            num_frames = int(video_info['nb_frames'])
            bit_rate = int(video_info['bit_rate'])
            avg_frame_rate = round(int(
                video_info['avg_frame_rate'].rpartition('/')[0]) / int(video_info['avg_frame_rate'].rpartition('/')[2]))
            if st.button(f'Start frame extraction for {num_frames} frames at {avg_frame_rate} frames per second'):
                try:
                    (ffmpeg.input(os.path.join(vid_dir, vid_file))
                     .filter('fps', fps=avg_frame_rate)
                     .output(os.path.join(frame_dir, 'frame%01d.png'), video_bitrate=bit_rate,
                             s=f'{int(width * 0.5)}x{int(height * 0.5)}', sws_flags='bilinear',
                             start_number=0)
                     .run(capture_stdout=True, capture_stderr=True))
                    st.info(f'Done extracting **{num_frames}** frames from video **{vid_file}**.')
                except ffmpeg.Error as e:
                    print('stdout:', e.stdout.decode('utf8'))
                    print('stderr:', e.stderr.decode('utf8'))
            try:
                os.mkdir(os.path.join(DLC_PROJECT_PATH, csv_dir, 'mp4s'))
            except FileExistsError:
                pass
            try:
                os.mkdir(os.path.join(DLC_PROJECT_PATH, csv_dir, 'mp4s', csv_filename))
            except FileExistsError:
                pass
            shortvid_dir = os.path.join(DLC_PROJECT_PATH, csv_dir, 'mp4s', csv_filename)
            st.markdown(f'You have created **{shortvid_dir}** as your .mp4 directory for '
                        f'group examples from video {vid_file}.')
            min_time = st.number_input('Enter minimum time for bout in ms:', value=100)
            min_frames = round(float(min_time) * 0.001 * float(FPS))
            st.markdown(f'You have entered **{min_time} ms** as your minimum duration per bout, '
                        f'which is equivalent to **{min_frames} frames** '
                        f'(lower this value to obtain more example clips per group).')
            number_examples = st.slider('Select number of non-repeated examples', 1, 10, 3)
            st.markdown(f'You will obtain a maximum of **{number_examples}** non-repeated output examples per group.')
            out_fps = int(st.number_input('Enter output frame-rate:', value=30))
            playback_speed = float(out_fps) / float(FPS)
            st.markdown(f'You have chosen to view these examples at **{out_fps} FPS**, which is '
                        f'equivalent to **{playback_speed}X speed**.')
            if st.button("Predict labels and create example videos"):
                with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr:
                    feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr)
                curr_df = pd.read_csv(os.path.join(DLC_PROJECT_PATH, csv_dir, csv_file), low_memory=False)
                curr_df_filt, perc_rect = likelihoodprocessing.adaptive_filter_LEGACY(curr_df)
                test_data = [curr_df_filt]
                labels_frameshift = []
                frameshift_labels = []
                # TODO: low: change loop variable names so each is unique (clarity)
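                # Frameshift prediction: the classifier returns several frame-shifted label streams; the first
                # math.floor(FPS / 10) of them are padded with -1 to a common length and interleaved column-wise
                # via flatten('F') (e.g. with three streams s0, s1, s2 the output order is s0[0], s1[0], s2[0],
                # s0[1], ...), approximating a label for every camera frame.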
                for i in range(len(test_data)):
                    feats_new = classify.bsoid_extract_app(test_data, FPS)
                    labels = classify.bsoid_predict_app(feats_new, clf)
                    for m in range(len(labels)):
                        labels[m] = labels[m][::-1]
                    labels_pad = -1 * np.ones([len(labels), len(max(labels, key=lambda x: len(x)))])
                    for n, l in enumerate(labels):
                        labels_pad[n][0:len(l)] = l
                        labels_pad[n] = labels_pad[n][::-1]
                        if n > 0:
                            labels_pad[n][0:n] = labels_pad[n - 1][0:n]
                    labels_frameshift.append(labels_pad.astype(int))
                for k in range(len(labels_frameshift)):
                    labels_fs2 = []
                    for l in range(math.floor(FPS / 10)):
                        labels_fs2.append(labels_frameshift[k][l])
                    frameshift_labels.append(np.array(labels_fs2).flatten('F'))
                st.info(f'Done frameshift-predicting **{csv_file}**.')
                # def create_labeled_vid_app(labels, crit, counts, output_fps, video_frames_directory, output_path) -> None:
                # Only one csv is processed in this branch, so frameshift_labels holds exactly one element
                videoprocessing.create_labeled_example_videos_by_label(
                    frameshift_labels[0], crit=int(min_frames), counts=int(number_examples),
                    output_fps=int(out_fps), video_frames_directory=frame_dir, output_path=shortvid_dir)
                st.balloons()
            if st.checkbox(f"Show example videos? (loading it up from {shortvid_dir})", False):
                example_vid = st.selectbox('Select the video (.mp4 or .avi)', sorted(os.listdir(shortvid_dir)))
                example_vid_file = open(os.path.join(shortvid_dir, example_vid), 'rb')
                st.markdown(f'You have selected **{example_vid}** as your video from {shortvid_dir}.')
                video_bytes = example_vid_file.read()
                st.video(video_bytes)

        if prediction_options == 'Bulk process all csvs':
            st.write('Bulk processing will take some time for large datasets: '
                     'many files, long videos, and/or high frame rates.')
            TEST_FOLDERS = []
            num_project_path_sub_directories: int = int(st.number_input('How many sub-directories for bulk predictions?', value=3))
            st.markdown(f'You will be predicting on **{num_project_path_sub_directories}** csv-containing sub-directories.')
            for i in range(num_project_path_sub_directories):
                test_dir = st.text_input(f'Enter path to test directory number {i+1} within base path:')
                try:
                    os.listdir(os.path.join(DLC_PROJECT_PATH, test_dir))
                except FileNotFoundError:
                    st.error('No such directory')
                except Exception as e:
                    err = f'Unexpected error found: {repr(e)}'
                    st.error(err)
                    logger.error(err)
                if test_dir not in TEST_FOLDERS:
                    TEST_FOLDERS.append(test_dir)
            st.markdown(f'You have selected sub-directory(ies) **{TEST_FOLDERS}**.')
            FPS = int(st.number_input('What is your framerate for these csvs?', value=60))  # 60 is simply the default shown in the input widget
            st.markdown(f'Your frame-rate is **{FPS}** frames per second for these CSVs.')
            st.text_area('Select the analysis of interest to you. If in doubt, select all.', '''
            Predicted labels with original pose: labels written into the original .csv files (time-locked).
            Behavioral bout lengths in chronological order: each behavior and its bout lengths over time.
            Behavioral bout statistics: basic statistics for the behavioral bout durations.
            Transition matrix: behavioral transitions modeled as a Markov process.
            ''')
            result2_options = st.multiselect('What type of results do you want to export?',
                                             ['Predicted labels with original pose',
                                              'Behavioral bout lengths in chronological order',
                                              'Behavioral bout statistics', 'Transition matrix'],
                                             ['Predicted labels with original pose', 'Behavioral bout statistics'])
            if st.button("Begin bulk csv processing, potentially a long computation"):
                st.write('The B-SOiD csv files will be saved in the folders containing the original '
                         'pose-estimation csvs, under a BSOID sub-directory.')
                with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr:
                    feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr)
                folders, filenames, data_new, perc_rect = io.import_folders_app(DLC_PROJECT_PATH, TEST_FOLDERS, BODYPARTS)
                labels_frameshift, labels_fs2, frameshift_labels = [], [], []
                bar = st.progress(0)
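                # Same frameshift prediction scheme as the single-video branch above, applied to every imported csv;
                # the progress bar reports how many files have been predicted so far.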
                for i in range(len(data_new)):
                    feats_new = classify.bsoid_extract_app([data_new[i]], FPS)
                    labels = classify.bsoid_predict_app(feats_new, clf)
                    for m in range(0, len(labels)):
                        labels[m] = labels[m][::-1]
                    labels_pad = -1 * np.ones([len(labels), len(max(labels, key=lambda x: len(x)))])
                    for n, l in enumerate(labels):
                        labels_pad[n][0:len(l)] = l
                        labels_pad[n] = labels_pad[n][::-1]
                        if n > 0:
                            labels_pad[n][0:n] = labels_pad[n - 1][0:n]
                    labels_frameshift.append(labels_pad.astype(int))
                    bar.progress(round((i + 1) / len(data_new) * 100))
                for k in range(len(labels_frameshift)):
                    labels_fs2 = []
                    for l in range(math.floor(FPS / 10)):
                        labels_fs2.append(labels_frameshift[k][l])
                    frameshift_labels.append(np.array(labels_fs2).flatten('F'))
                st.info(f'Done frameshift-predicting a total of **{len(data_new)}** files.')
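                # Re-read each pose csv from the selected folders so the frameshifted labels can be written out
                # time-locked to the original pose coordinates, one output csv per input file.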
                filenames = []
                all_df = []
                folders_list = []
                for i, folder in enumerate(TEST_FOLDERS):  # Loop through folders
                    f = io.get_filenames_csvs_from_folders_recursively_in_dlc_project_path(folder)
                    for j, filename in enumerate(f):
                        curr_df = pd.read_csv(filename, low_memory=False)
                        filenames.append(filename)
                        folders_list.append(folder)
                        all_df.append(curr_df)
                for i in range(len(frameshift_labels)):
                    timestr = time.strftime("_%Y%m%d_%H%M_")
                    csv_filename = os.path.basename(filenames[i]).rpartition('.')[0]
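                    # Pad the frameshifted labels by repeating the last label so that, together with the two blank
                    # rows inserted below, they match the row count of the pose csv; the blank rows presumably
                    # offset the two extra header rows pandas reads in from the DeepLabCut csv.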
                    fs_labels_pad = np.pad(frameshift_labels[i], (0, len(all_df[i]) - 2 - len(frameshift_labels[i])), 'edge')
                    df2 = pd.DataFrame(fs_labels_pad, columns=['B-SOiD labels'])
                    df2.loc[len(df2)] = ''
                    df2.loc[len(df2)] = ''  # intentional second filler row (see comment above)
                    df2 = df2.shift()
                    df2.loc[0] = ''
                    df2 = df2.shift()
                    df2.loc[0] = ''
                    frames = [df2, all_df[i]]
                    xyfs_df = pd.concat(frames, axis=1)

                    df_runlengths, df_dur_statistics, B, df_tm, B_norm = \
                        statistics.main_app(frameshift_labels[i], len(np.unique(nn_assignments)))
                    try:
                        os.mkdir(os.path.join(f'{DLC_PROJECT_PATH}{folders_list[i]}', 'BSOID'))
                    except FileExistsError:
                        pass
                    if any('Predicted labels with original pose' in o for o in result2_options):
                        xyfs_filename = os.path.join(f'{DLC_PROJECT_PATH}{folders_list[i]}', 'BSOID', f'labels_pose_{FPS}Hz{timestr}{csv_filename}.csv')  # xyfs_filename = os.path.join(DLC_PROJECT_PATH + folders_list[i] + '/BSOID', str.join('', ('', str(FPS), 'Hz', timestr, csv_filename, '.csv')))
                        xyfs_df.to_csv(xyfs_filename, index=True, chunksize=10000, encoding='utf-8')
                    if any('Behavioral bout lengths in chronological order' in o for o in result2_options):
                        df_runlengths.to_csv(
                            os.path.join(f'{DLC_PROJECT_PATH}{folders_list[i]}', 'BSOID',
                                         f'bout_lengths_{FPS}Hz{timestr}{csv_filename}.csv'),
                            index=True, chunksize=10000, encoding='utf-8')
                    if any('Behavioral bout statistics' in o for o in result2_options):
                        df_dur_statistics.to_csv(
                            os.path.join(f'{DLC_PROJECT_PATH}{folders_list[i]}', 'BSOID',
                                         f'bout_stats_{FPS}Hz{timestr}{csv_filename}.csv'),
                            index=True, chunksize=10000, encoding='utf-8')
                    if any('Transition matrix' in o for o in result2_options):
                        df_tm.to_csv(
                            os.path.join(f'{DLC_PROJECT_PATH}{folders_list[i]}', 'BSOID',
                                         f'transitions_mat_{FPS}Hz{timestr}{csv_filename}.csv'),
                            index=True, chunksize=10000, encoding='utf-8')
                with open(os.path.join(OUTPUT_PATH, app_model_predictions_filename), 'wb') as f:
                    joblib.dump([folders, folders_list, filenames, data_new, frameshift_labels], f)
                st.balloons()

    return