def test_discrete_clusters():
    """Check that each discrete clustering model produces near-pure clusters.

    Every model name is handed to ``hyp.cluster`` twice: once with the
    ``clusters`` fixture and once with the list ``[cluster1, cluster2]``.
    The returned labels are compared against ``true_labels``; all of these
    fixtures are expected to be defined in the enclosing module.
    """

    def homogeneity_test(estimates, truth, threshold=0.95):
        """Assert that each estimated cluster is dominated by one true class.

        For every distinct label in ``estimates`` other than -1, at least
        ``threshold`` of the matching rows of ``truth`` must equal 0, or at
        least ``threshold`` of them must equal 1.
        """
        assigned = estimates.values
        for label in np.unique(assigned):
            # -1 is skipped -- presumably the "noise" label emitted by
            # density-based models such as DBSCAN/OPTICS (TODO confirm).
            if label == -1:
                continue

            members = np.where(assigned == label)[0]
            if len(members) == 0:
                continue

            member_truth = truth.iloc[members].values
            frac_zeros = np.sum(member_truth == 0) / len(members)
            frac_ones = np.sum(member_truth == 1) / len(members)
            assert (frac_zeros >= threshold) or (frac_ones >= threshold)

    models = [
        'AffinityPropagation',
        'AgglomerativeClustering',
        'Birch',
        'DBSCAN',
        'OPTICS',
        'FeatureAgglomeration',
        'KMeans',
        'MiniBatchKMeans',
        'MeanShift',
        'SpectralClustering',
    ]

    # Row count of the first dataset marks where true_labels is split.
    n_first = cluster1.shape[0]

    for model_name in models:
        # Single stacked dataset.
        stacked_labels = hyp.cluster(clusters, model=model_name)
        homogeneity_test(stacked_labels, true_labels)

        # List of datasets: one label set is checked per input dataset.
        per_dataset = hyp.cluster([cluster1, cluster2], model=model_name)
        homogeneity_test(per_dataset[0], true_labels.iloc[:n_first])
        homogeneity_test(per_dataset[1], true_labels.iloc[n_first:])
def cluster(x, n_clusters=5):
    """Group the rows of ``x`` by k-means labels obtained from hypertools.

    Parameters
    ----------
    x : DataFrame-like (must support ``.iloc``)
        Per the original author's note: one row per video and one column
        per timepoint/topic, e.g. each row is a flattened trajectory.
    n_clusters : int, optional
        Number of clusters requested from ``hyp.cluster``. Defaults to 5.

    Returns
    -------
    clusters : list
        One entry per distinct label (in ``np.unique`` order); each entry is
        a copy of the rows of ``x`` assigned to that label. For example,
        ``clusters[0].iloc[0]`` is the reshaped trajectory of the first
        video in the first cluster.
    clustered_labels
        The labels exactly as returned by ``hyp.cluster``.
    """
    # Forward the caller's n_clusters; the value used to be hard-coded to 5,
    # which silently ignored the parameter.
    clustered_labels = hyp.cluster(x, cluster='KMeans', n_clusters=n_clusters)

    # Compare against an ndarray so the element-wise ``==`` below does not
    # depend on the container type hyp.cluster happens to return.
    label_array = np.asarray(clustered_labels)

    clusters = []
    for k in np.unique(label_array):
        inds = np.where(label_array == k)[0]
        clusters.append(x.iloc[inds].copy())

    return clusters, clustered_labels
def test_cluster_mixture():
    """Check mixture-model clustering returns valid per-sample proportions.

    Each mixture model is run through ``hyp.cluster`` in
    ``'fit_predict_proba'`` mode on the module-level ``clusters`` fixture.
    The result must have one row per sample and one column per component,
    every component must receive some weight, and each row must be a set of
    values in [0, 1] that sums to 1.
    """
    n_components = 3
    mode = 'fit_predict_proba'
    models = ['GaussianMixture', 'BayesianGaussianMixture']

    for m in models:
        next_model = {'model': m, 'args': [], 'kwargs': {'n_components': n_components}}
        mixture_proportions = hyp.cluster(clusters, model=next_model, mode=mode)

        # Tie the expected column count to n_components rather than a
        # literal, so the check stays correct if n_components is changed.
        assert mixture_proportions.shape == (clusters.shape[0], n_components)

        # No component may be entirely unused across the samples.
        assert np.all(np.sum(np.abs(mixture_proportions), axis=0) > 0)

        # Each row must hold values in [0, 1] that sum to 1.
        assert np.all(mixture_proportions >= 0)
        assert np.all(mixture_proportions <= 1)
        assert np.allclose(np.sum(mixture_proportions, axis=1), 1)
# -*- coding: utf-8 -*-
"""
=============================
Using the cluster function to label clusters
=============================

Here is an example where we generate some synthetic data, and then use the
cluster function to get cluster labels, which we can then pass to the `group`
kwarg to color our points by cluster.
"""

# Code source: Andrew Heusser
# License: MIT

# import
import hypertools as hyp
import numpy as np
from scipy.stats import multivariate_normal

# simulate clusters: two 3-D Gaussian blobs with identity covariance,
# one centred at the origin and one centred at (3, 3, 3)
n_dims = 3
n_points = 100
covariance = np.eye(n_dims)

cluster1 = np.random.multivariate_normal(
    np.zeros(n_dims), covariance, size=n_points
)
cluster2 = np.random.multivariate_normal(
    np.full(n_dims, 3.0), covariance, size=n_points
)

# stack the two blobs row-wise into a single dataset
data = np.concatenate((cluster1, cluster2), axis=0)

# get cluster labels
cluster_labels = hyp.cluster(data, n_clusters=2)

# plot, coloring each point by its cluster label
hyp.plot(data, '.', group=cluster_labels)
geo = hyp.plot(data, '.', reduce='FastICA') geo = hyp.plot(data, '.', reduce='FactorAnalysis') geo = hyp.plot(data, '.', reduce='TruncatedSVD') #same results like PCA geo = hyp.plot(data, '.', reduce='DictionaryLearning') #took a long time to run geo = hyp.plot(data, '.', reduce='MiniBatchDictionaryLearning') geo = hyp.plot(data, '.', reduce='TSNE') #takes long time to run geo = hyp.plot(data, '.', reduce='Isomap') #memory error geo = hyp.plot(data, '.', reduce='SpectralEmbedding') #system hangs geo = hyp.plot(data, '.', reduce='LocallyLinearEmbedding') geo = hyp.plot(data, '.', reduce='MDS') #memory error geo = hyp.plot(data, '.', reduce={'model': 'PCA', 'params': {'whiten': True}}) training_set = data.iloc[np.random.choice(len(data), 10000), :] birch = hyp.cluster(training_set, cluster='Birch') all_birch = birch.apply(data) geo_cluster = hyp.plot(training_set, '.', cluster='HDBSCAN', n_clusters=6) #Clustering geo_cluster = hyp.plot(data, '.', n_clusters=6) geo_cluster = hyp.plot(data, '.', cluster='KMeans', n_clusters=8) geo_cluster = hyp.plot(data, '.', cluster='MiniBatchKMeans', n_clusters=8) geo_cluster = hyp.plot(data, '.', cluster='AgglomerativeClustering', n_clusters=8) #memory error geo_cluster = hyp.plot( data, '.', cluster='Birch', n_clusters=8