def test_discrete_clusters():
    """Check that every discrete clustering model recovers the two true groups.

    Each model is run once on the stacked dataset (``clusters``) and once on
    the same data passed as a list of two arrays (``[cluster1, cluster2]``).
    In both cases every estimated cluster must be (almost) pure with respect
    to the binary ground-truth labels in ``true_labels``.
    """

    def homogeneity_test(estimates, truth, threshold=0.95):
        """Assert that each estimated cluster is dominated by one true label.

        Parameters
        ----------
        estimates : object exposing ``.values``
            Estimated cluster label for each observation.
        truth : object exposing ``.iloc`` / ``.values``
            Ground-truth labels (0 or 1), aligned with ``estimates``.
        threshold : float
            Minimum fraction of a cluster's members that must share the same
            true label.
        """
        for x in np.unique(estimates.values):
            if x == -1:
                # -1 is the "noise" label used by density-based models
                # (e.g. DBSCAN, OPTICS); noise points are not a cluster, so
                # they are excluded from the purity check.
                continue

            inds = np.where(estimates.values == x)[0]
            if len(inds) > 0:
                zeros = truth.iloc[inds].values == 0
                ones = truth.iloc[inds].values == 1

                assert ((np.sum(zeros) / len(inds)) >= threshold) or ((np.sum(ones) / len(inds)) >= threshold)

    models = ['AffinityPropagation', 'AgglomerativeClustering', 'Birch', 'DBSCAN', 'OPTICS', 'FeatureAgglomeration',
              'KMeans', 'MiniBatchKMeans', 'MeanShift', 'SpectralClustering']

    for m in models:
        # single stacked dataset
        labels = hyp.cluster(clusters, model=m)
        homogeneity_test(labels, true_labels)

        # list of datasets: labels come back per dataset, so the truth is
        # split at the cluster1/cluster2 boundary to match
        labels2 = hyp.cluster([cluster1, cluster2], model=m)
        homogeneity_test(labels2[0], true_labels.iloc[:cluster1.shape[0]])
        homogeneity_test(labels2[1], true_labels.iloc[cluster1.shape[0]:])
def cluster():
    """Split the rows of ``x`` into groups by KMeans cluster label.

    ``x`` should be a dataframe with 1 row per video and 1 column per
    timepoint/topic -- e.g. the result of ``np.ravel(x0.values).T``.

    NOTE(review): ``x`` is not a parameter here; it is resolved from the
    enclosing/module scope. It looks like it was meant to be an argument --
    confirm against callers before changing the signature.

    Returns
    -------
    clusters : list
        One entry per distinct label, in sorted label order. ``clusters[0]``
        is a number-of-cluster_0-videos by timepoints*topics dataframe;
        ``clusters[0].iloc[0]`` is the reshaped trajectory of the first video
        in the first cluster (a 1 by timepoints*topics matrix).
    clustered_labels
        The labels exactly as returned by ``hyp.cluster``.
    """
    clustered_labels = hyp.cluster(x, cluster='KMeans', n_clusters=5)

    # Compare against an ndarray: if hyp.cluster hands back a plain list,
    # `clustered_labels == k` is a single bool and np.where finds no rows.
    label_array = np.asarray(clustered_labels)

    clusters = []
    for k in np.unique(label_array):
        inds = np.where(label_array == k)[0]
        # collect the rows belonging to this label (previously the indices
        # were computed but never used, so `clusters` was always empty)
        clusters.append(x.iloc[inds])
    return clusters, clustered_labels
def test_cluster_mixture():
    """Check the mixture-model output of ``hyp.cluster`` in proba mode.

    For each mixture model, the result must have one row per observation in
    ``clusters`` and one column per component, every component must receive
    some weight, all entries must lie in [0, 1], and each row must sum to 1.
    """
    n_components = 3
    mode = 'fit_predict_proba'

    for model_name in ('GaussianMixture', 'BayesianGaussianMixture'):
        model_spec = {
            'model': model_name,
            'args': [],
            'kwargs': {'n_components': n_components},
        }
        proportions = hyp.cluster(clusters, model=model_spec, mode=mode)

        # one row per observation, one column per mixture component
        expected_shape = (clusters.shape[0], n_components)
        assert proportions.shape == expected_shape

        # no component is left completely unused
        weight_per_component = np.sum(np.abs(proportions), axis=0)
        assert np.all(weight_per_component > 0)

        # entries are valid probabilities ...
        assert np.all(proportions >= 0)
        assert np.all(proportions <= 1)

        # ... and each observation's memberships sum to one
        total_per_observation = np.sum(proportions, axis=1)
        assert np.allclose(total_per_observation, 1)
# -*- coding: utf-8 -*-
"""
Using the cluster function to label clusters

Here is an example where we generate some synthetic data, and then use the
cluster function to get cluster labels, which we can then pass to the `group`
kwarg to color our points by cluster.
"""

# Code source: Andrew Heusser
# License: MIT

# import
import hypertools as hyp
import numpy as np
from scipy.stats import multivariate_normal

# simulate two 3-dimensional Gaussian clusters of 100 points each: both have
# identity covariance, one is centred at the origin and the other at (3, 3, 3)
origin = np.zeros(3)
unit_cov = np.eye(3)
cluster1 = np.random.multivariate_normal(origin, unit_cov, size=100)
cluster2 = np.random.multivariate_normal(origin + 3, unit_cov, size=100)
data = np.vstack((cluster1, cluster2))

# ask hypertools for a 2-cluster labelling of the stacked points
cluster_labels = hyp.cluster(data, n_clusters=2)

# scatter plot, colouring each point by its assigned cluster
hyp.plot(data, '.', group=cluster_labels)
# Plot the data once per dimensionality-reduction model, in this order.
# The trailing notes record what was observed when each model was tried.
reduce_models = (
    'FastICA',
    'FactorAnalysis',
    'TruncatedSVD',  # same results like PCA
    'DictionaryLearning',  # took a long time to run
    'MiniBatchDictionaryLearning',
    'TSNE',  # takes long time to run
    'Isomap',  # memory error
    'SpectralEmbedding',  # system hangs
    'LocallyLinearEmbedding',
    'MDS',  # memory error
)
for reduce_model in reduce_models:
    geo = hyp.plot(data, '.', reduce=reduce_model)

# a model can also be given as a dict carrying its own parameters
whitened_pca = {'model': 'PCA', 'params': {'whiten': True}}
geo = hyp.plot(data, '.', reduce=whitened_pca)

# Draw 10000 row positions at random (np.random.choice samples with
# replacement by default) and use those rows as a training subset.
# Assumes `data` supports pandas-style `.iloc` indexing here -- TODO confirm.
training_set = data.iloc[np.random.choice(len(data), 10000), :]
# Fit Birch clustering on the subset only.
birch = hyp.cluster(training_set, cluster='Birch')
# NOTE(review): relies on the object returned by hyp.cluster exposing
# `.apply`, presumably to label the full dataset with the fitted model --
# verify against the hypertools version in use.
all_birch = birch.apply(data)

# Plot the training subset, clustering with HDBSCAN.
geo_cluster = hyp.plot(training_set, '.', cluster='HDBSCAN', n_clusters=6)


# Plot the full dataset with different clustering settings; each call
# overwrites `geo_cluster`, so only the last result is kept.
geo_cluster = hyp.plot(data, '.', n_clusters=6)
geo_cluster = hyp.plot(data, '.', cluster='KMeans', n_clusters=8)
geo_cluster = hyp.plot(data, '.', cluster='MiniBatchKMeans', n_clusters=8)
# No format string or cluster model is passed here, so the library defaults
# apply. NOTE(review): the "memory error" note suggests a specific model was
# intended -- confirm whether arguments are missing from this call.
geo_cluster = hyp.plot(data,
                       n_clusters=8)  #memory error
geo_cluster = hyp.plot(
    data, '.', cluster='Birch', n_clusters=8