Пример #1
0
    def test_clustering_namingTestRequireUnique(self):
        """Re-using a solution name with Require_Unique=1 must not add a solution."""
        clusterer = oe.cluster(self.data)
        solution_name = 'kmeans'

        # The first run registers exactly one solution under the name.
        clusterer.cluster('parent', 'kmeans', solution_name, K=2)
        self.assertEqual(1, len(clusterer.labels))

        # Repeating the same name while requiring uniqueness leaves the
        # number of stored solutions unchanged.
        clusterer.cluster('parent',
                          'kmeans',
                          solution_name,
                          Require_Unique=1,
                          K=2)
        self.assertEqual(1, len(clusterer.labels))
Пример #2
0
	def mixture_model(self, K=2, iterations=10):
		"""
		Finishing technique that assembles a final, hard partition of the data by maximizing the likelihood of the
		clustering solutions observed across the ensemble. Every solution stored in self.labels is passed to
		finish.mixture_model. See finishing.mixture_model for more details.

		Parameters
		----------
		K: int
			number of clusters to create. Default K=2
		iterations: int
			number of iterations of the EM algorithm to perform. Default iterations=10

		Returns
		-------
		c: openensembles clustering object
			a new clustering object with c.labels['mixture_model'] set to the final solution.

		Raises
		------
		ValueError
			If the cluster object holds fewer than two clustering solutions

		References
		----------
		Topchy, Jain, and Punch, "A mixture model for clustering ensembles Proc. SIAM Int. Conf. Data Mining (2004)"

		Examples
		--------
		>>> cMM = c.mixture_model(4, 10)
		>>> d.plot_data('parent', cluster_labels=cMM.labels['mixture_model'])

		"""
		# a finishing technique needs an ensemble, i.e. at least two solutions
		if len(self.params) < 2:
			raise ValueError("Mixture Model is a finsihing technique for an ensemble, the cluster object must contain more than one solution")

		n_objects = self.dataObj.D['parent'].shape[0]
		ensemble_labels = [self.labels[key] for key in self.labels]

		model = finish.mixture_model(ensemble_labels, n_objects, nEnsCluster=K, iterations=iterations)
		model.emProcess()

		# package the final solution in a fresh cluster object built on the same data
		key = 'mixture_model'
		result = oe.cluster(self.dataObj)
		result.labels[key] = model.labels
		result.data_source[key] = 'parent'
		result.clusterNumbers[key] = np.unique(result.labels[key])
		result.params[key] = {'iterations': iterations, 'K': K}
		result.algorithms[key] = 'mixture_model'
		return result
Пример #3
0
 def test_ReplicateValidation(self):
     """Calculating the same metric twice must not add a second validation entry."""
     clusterer = oe.cluster(self.data)
     clusterer.cluster('parent', 'kmeans', 'kmeans', K=2)
     validator = oe.validation(self.data, clusterer)

     # Nothing has been calculated yet.
     self.assertEqual(0, len(validator.validation))

     # The first calculation stores one entry; the identical repeat must
     # leave the count at one.
     for _ in range(2):
         validator.calculate('Ball_Hall', 'kmeans', 'parent')
         self.assertEqual(1, len(validator.validation))
Пример #4
0
 def test_validation_badSourceAndCluster(self):
     """calculate() must raise ValueError when any of its three names is unknown."""
     clusterer = oe.cluster(self.data)
     clusterer.cluster('parent', 'kmeans', 'kmeans', K=2)
     validator = oe.validation(self.data, clusterer)

     # Each call replaces one argument of the valid call
     # ('Ball_Hall', 'kmeans', 'parent') with a nonsense name.
     bad_calls = (
         ('Ball_Hall', 'kmeans', 'gobblygook'),
         ('Ball_Hall', 'gobblygook', 'parent'),
         ('GobblyGook', 'kmeans', 'parent'),
     )
     for first_arg, second_arg, third_arg in bad_calls:
         with self.assertRaises(ValueError):
             validator.calculate(first_arg, second_arg, third_arg)
Пример #5
0
 def test_allValidationMetrics(self):
     """Every available metric adds one entry to each validation container."""
     clusterer = oe.cluster(self.data)
     clusterer.cluster('parent', 'kmeans', 'kmeans', K=2)
     validator = oe.validation(self.data, clusterer)

     metric_names = validator.validation_metrics_available()
     for expected_count, metric in enumerate(metric_names, start=1):
         validator.calculate(metric, 'kmeans', 'parent')
         # All four containers must grow in lockstep with the metrics run.
         containers = (validator.validation, validator.description,
                       validator.source_name, validator.cluster_name)
         for container in containers:
             self.assertEqual(expected_count, len(container))
Пример #6
0
    def finish_co_occ_linkage(self, threshold, linkage='average'):
        """
        The finishing technique that calculates a co-occurrence matrix on all cluster solutions in the ensemble and
        then hierarchically clusters the co-occurrence, treating it as a similarity matrix. The clusters are defined by
        the threshold of the distance used to cut.

        Parameters
        ----------
        threshold: float
            Linkage distance to use as a cutoff to create partitions
        linkage: string
            Linkage type. See `scipy.cluster.hierarchy <https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage>`_
            Default is 'average'

        Returns
        -------
        c: openensembles clustering object
            a new clustering object with c.labels['co_occ_linkage'] set to the final solution.

        Examples
        --------
        To determine where the cut is visually, at threshold=0.5:

        >>> coMat = c.co_occurrence_matrix()
        >>> coMat.plot(threshold=0.5, linkage='ward')

        To create the cut at threshold=0.5

        >>> cWard = c.finish_co_occ_linkage(0.5, 'ward')
        >>> d.plot_data('parent', cluster_labels=cWard.labels['co_occ_linkage'])

        """
        finisher = finish.co_occurrence_linkage(
            self.co_occurrence_matrix('parent'), threshold, linkage=linkage)
        finisher.finish()

        # Package the final solution in a fresh cluster object on the same data.
        key = 'co_occ_linkage'
        result = oe.cluster(self.dataObj)
        result.labels[key] = finisher.labels
        result.params[key] = {'linkage': linkage, 'threshold': threshold}
        result.data_source[key] = 'parent'
        result.clusterNumbers[key] = np.unique(result.labels[key])
        result.algorithms[key] = 'co_occ_linkage'
        return result
Пример #7
0
    def test_cluster_merge(self):
        """merge() raises on a list holding a string and adds the merged objects' solutions."""
        target = oe.cluster(self.data)
        donor = oe.cluster(self.data)
        for obj in (target, donor):
            obj.cluster('parent', 'kmeans', 'kmeans', K=2)

        # A list that holds a plain string is rejected and changes nothing.
        with self.assertRaises(ValueError):
            target.merge(['string'])
        self.assertEqual(1, len(target.labels))

        # Merging one object grows the target and leaves the donor untouched.
        target.merge([donor])
        self.assertEqual(1, len(donor.labels))
        self.assertEqual(2, len(target.labels))

        # Start again, this time sending in a list of two objects.
        target = oe.cluster(self.data)
        two_solutions = oe.cluster(self.data)
        one_solution = oe.cluster(self.data)
        target.cluster('parent', 'kmeans', 'kmeans', K=2)
        two_solutions.cluster('parent', 'kmeans', 'kmeans', K=2)
        two_solutions.cluster('parent', 'kmeans', 'kmeans_another', K=2)
        one_solution.cluster('parent', 'kmeans', 'kmeans', K=2)

        target.merge([two_solutions, one_solution])
        self.assertEqual(4, len(target.labels))
Пример #8
0
    def test_cluster_slice(self):
        """slice() keeps only the requested solutions and raises on an unknown name."""
        clusterer = oe.cluster(self.data)
        for index in range(3):
            clusterer.cluster('parent', 'kmeans', 'kmeans_%d' % index, K=2)
        self.assertEqual(3, len(clusterer.labels))

        # Two of the three solutions survive, in each checked container.
        subset = clusterer.slice(['kmeans_1', 'kmeans_2'])
        for container in (subset.labels, subset.params,
                          subset.clusterNumbers, subset.data_source):
            self.assertEqual(2, len(container))

        # One unknown name is enough to make the slice fail.
        with self.assertRaises(ValueError):
            clusterer.slice(['kmeans_2', 'gooblygook'])
Пример #9
0
    def test_all_algorithms(self):
        """
        Run every available algorithm (except MeanShift) with K=2 and
        check that each run stored one solution.
        """
        clusterer = oe.cluster(self.data)
        available = clusterer.algorithms_available()

        # MeanShift cannot be used in this dataset, so it is left out.
        del available['MeanShift']

        algorithm_names = list(available)
        for algorithm in algorithm_names:
            clusterer.cluster('parent', algorithm, algorithm + 'parent', K=2)

        self.assertEqual(len(algorithm_names), len(clusterer.labels))
Пример #10
0
    def slice(self, names):
        """
        Returns a new cluster object holding only the solutions named in the given list
        (the names are the dictionary keys shared by labels, params, etc.)

        Parameters
        ----------
        names: list
            A list of strings matching the names to keep in the new slice

        Returns
        --------
        c: an openensembles clustering object
            A oe.cluster object that contains only those names passed in

        Raises
        ------
        ValueError
            If a name in the list of names does not exist in cluster object

        Examples
        --------
        Get only the solutions made by agglomerative clustering

        >>> names = c.search_field('algorithm', 'agglomerative') #return all solutions with agglomerative
        >>> cNew = c.slice(names)

        Get only the solutions that were made with K=2 calls

        >>> names = c.search_field('K', 2) #return all solution names that used K=2
        >>> cNew = c.slice(names)

        """
        subset = oe.cluster(self.dataObj)
        known_names = [key for key in self.labels]
        message = "ERROR: the source you requested for slicing does not exist in cluster object %s"

        for name in names:
            if name not in known_names:
                raise ValueError(message % (name))
            # Copy this solution's entry from each per-solution dictionary.
            copies = (
                (subset.labels, self.labels),
                (subset.data_source, self.data_source),
                (subset.params, self.params),
                (subset.clusterNumbers, self.clusterNumbers),
                (subset.algorithms, self.algorithms),
            )
            for destination, origin in copies:
                destination[name] = origin[name]
        return subset
Пример #11
0
    def test_cluster_search_field(self):
        """search_field() finds solutions by algorithm, source, cluster number and parameters."""
        self.data.transform('parent', 'zscore', 'zscore', axis=0)
        clusterer = oe.cluster(self.data)

        clusterer.cluster('parent', 'kmeans', 'parent_kmeans_2', K=2)
        clusterer.cluster('zscore', 'kmeans', 'kmeans_3', K=3)
        for linkage in ('ward', 'complete'):
            clusterer.cluster('zscore',
                              'agglomerative',
                              'zscore_agglom_' + linkage,
                              K=2,
                              linkage=linkage)
        self.assertEqual(4, len(clusterer.labels))

        # (field, value, expected number of hits, expected first hit or None)
        expectations = (
            ('algorithm', 'kmeans', 2, None),
            ('data_source', 'parent', 1, 'parent_kmeans_2'),
            ('clusterNumber', 3, 1, 'kmeans_3'),
            ('K', 2, 3, None),
            ('linkage', 'ward', 1, 'zscore_agglom_ward'),
        )
        for field, value, expected_hits, expected_first in expectations:
            found = clusterer.search_field(field, value)
            self.assertEqual(expected_hits, len(found))
            if expected_first is not None:
                self.assertEqual(expected_first, found[0])

        # A field that no solution has must raise.
        with self.assertRaises(ValueError):
            clusterer.search_field('gobbly', 'gook')
Пример #12
0
def run_mv_oe(X, y=None):
    """Deprecated. Label X with a majority vote over 30 random-init k-means runs.

    Builds an OpenEnsembles data object from X, adds 30 k-means solutions
    (K=15, init='random', n_init=1) and finishes the ensemble with
    ``finish_majority_vote(threshold=0.5)``. If the vote yields at most one
    distinct label, the whole procedure is run again.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Data to cluster; ``X.shape[1]`` is taken as the number of features.
    y : optional
        Passed through unchanged as the third element of the result.

    Returns
    -------
    tuple
        ``(X, final_labels, y)`` where ``final_labels`` are the majority-vote
        labels shifted down by one.
    """
    n_features = X.shape[1]
    columns = [f"x{i}" for i in range(n_features)]

    df = pd.DataFrame(X, columns=columns)
    dataObj = oe.data(df, list(range(n_features)))

    c = oe.cluster(dataObj)
    for i in range(30):
        c.cluster('parent', 'kmeans', f'kmeans_{i}', K=15, init='random', n_init=1)

    # Only the vote over the complete ensemble is used, so it is computed
    # once here instead of after every added solution.
    c_MV = c.finish_majority_vote(threshold=0.5)
    final_labels = c_MV.labels['majority_vote'] - 1

    n_clusters = len(np.unique(final_labels))
    print(n_clusters)

    # Retry when the vote is degenerate. The original one-line return applied
    # the conditional to the third tuple element only, so a degenerate result
    # was still returned with a nested tuple in place of y.
    if n_clusters <= 1:
        return run_mv_oe(X, y)
    return X, final_labels, y
Пример #13
0
    def finish_graph_closure(self, threshold, clique_size=3):
        """
        The finishing technique that treats the co-occurrence matrix as a graph, that is binarized by the threshold (>=threshold
        becomes an unweighted, undirected edge in an adjacency matrix). This graph object is then subjected to clique formation
        according to clique_size (such as triangles if clique_size=3). The cliques are then combined in the graph to create unique
        cluster formations.

        Parameters
        ----------
        threshold: float
            Co-occurrence value at or above which two objects are joined by an edge
        clique_size: int
            Size of the cliques to form. Default clique_size=3

        See also
        --------
        finishing.py

        Returns
        -------
        c: openensembles clustering object
            New cluster object with final solution and name 'graph_closure'

        Examples
        --------
        >>> cGraph = c.finish_graph_closure(0.5, 3)
        >>> d.plot_data('parent', cluster_labels=cGraph.labels['graph_closure'])

        """
        co_occurrence = self.co_occurrence_matrix('parent')
        finisher = finish.graph_closure(co_occurrence.co_matrix,
                                        threshold,
                                        clique_size=clique_size)
        finisher.finish()

        # Package the final solution in a fresh cluster object on the same data.
        key = 'graph_closure'
        result = oe.cluster(self.dataObj)
        result.labels[key] = finisher.labels
        result.params[key] = {'threshold': threshold, 'clique_size': clique_size}
        result.data_source[key] = 'parent'
        result.clusterNumbers[key] = np.unique(result.labels[key])
        result.algorithms[key] = 'graph_closure'
        return result
Пример #14
0
    def test_distance_requirements_clustering(self):
        """Check which 'precomputed' calls raise ValueError and which add a solution."""
        clusterer = oe.cluster(self.data)

        # 'precomputed' without a matrix M raises for both algorithms.
        with self.assertRaises(ValueError):
            clusterer.cluster('parent',
                              'agglomerative',
                              'agglomerative',
                              K=2,
                              linkage='complete',
                              distance='precomputed')
        with self.assertRaises(ValueError):
            clusterer.cluster('parent',
                              'spectral',
                              'spectral',
                              K=2,
                              affinity='precomputed')

        distance_matrix = ca.returnDistanceMatrix(self.data.D['parent'],
                                                  'euclidean')
        similarity_matrix = ca.convertDistanceToSimilarity(distance_matrix)

        # Spectral with distance='precomputed' and the distance matrix raises.
        with self.assertRaises(ValueError):
            clusterer.cluster('parent',
                              'spectral',
                              'spectral',
                              K=2,
                              distance='precomputed',
                              M=distance_matrix)

        # Spectral with affinity='precomputed' and the similarity matrix works.
        clusterer.cluster('parent',
                          'spectral',
                          'spectral',
                          K=2,
                          affinity='precomputed',
                          M=similarity_matrix)
        self.assertEqual(1, len(clusterer.labels))

        # DBSCAN accepts the distance matrix and adds a second solution.
        clusterer.cluster('parent',
                          'DBSCAN',
                          'DBSCAN',
                          K=2,
                          affinity='precomputed',
                          M=distance_matrix)
        self.assertEqual(2, len(clusterer.labels))
Пример #15
0
    def finish_majority_vote(self, threshold=0.5):
        """
        Finish the ensemble by majority vote on the co-occurrence matrix of the 'parent' data.

        Based on Ana Fred's 2001 paper: Fred, Ana. Finding Consistent Clusters in Data Partitions. In Multiple Classifier Systems,
        edited by Josef Kittler and Fabio Roli, LNCS 2096, 309-18. Springer, 2001.
        This algorithm assigns objects to the same class if they co-cluster at least the threshold fraction of the time
        (50% by default). It greedily joins clusters with the evidence that at least one pair of items from two different
        clusters co-cluster a majority of the time. Outliers will get their own cluster.

        Parameters
        ----------
        threshold: float
            the threshold, or fraction of times objects co-cluster to consider a 'majority'. Default is 0.5 (50% of the time)

        Returns
        -------
        c: openensembles cluster object
            New cluster object with final solution and name 'majority_vote'. The threshold used is
            stored in c.params['majority_vote'].

        Examples
        --------
        >>> c_MV = c.finish_majority_vote(threshold=0.7)
        >>> labels = c_MV.labels['majority_vote']
        """
        # Record the threshold alongside the solution, as the other finishing
        # methods do for their arguments; it was previously left out.
        params = {'threshold': threshold}

        coMatObj = self.co_occurrence_matrix('parent')
        c_MV = finish.majority_vote(coMatObj.co_matrix, threshold)
        c_MV.finish()

        # Package the final solution in a fresh cluster object on the same data.
        c = oe.cluster(self.dataObj)
        name = 'majority_vote'
        c.labels[name] = c_MV.labels
        c.params[name] = params
        c.data_source[name] = 'parent'
        c.clusterNumbers[name] = np.unique(c.labels[name])
        c.algorithms[name] = 'majority_vote'
        return c
Пример #16
0
    def test_validation_merge(self):
        """merge() raises on a list holding a string and pulls in other objects' results."""
        clusterer = oe.cluster(self.data)
        for suffix in (1, 2, 3):
            clusterer.cluster('parent',
                              'kmeans',
                              'kmeans_%d' % suffix,
                              K=2,
                              random_seed=0,
                              init='random',
                              n_init=1)

        # One validation object per solution, each holding a single result.
        validators = [oe.validation(self.data, clusterer) for _ in range(3)]
        for suffix, validator in enumerate(validators, start=1):
            validator.calculate('silhouette', 'kmeans_%d' % suffix, 'parent')

        target, *others = validators
        self.assertEqual(1, len(target.validation.keys()))
        with self.assertRaises(ValueError):
            target.merge(['string'])

        # After merging the other two, the target holds all three results.
        target.merge(others)
        self.assertEqual(3, len(target.validation.keys()))
# --- SECTION 1 ---
# Libraries and data loading
import openensembles as oe
import numpy as np
import pandas as pd
import sklearn.metrics

from sklearn.datasets import load_breast_cancer
from sklearn.manifold import TSNE

bc = load_breast_cancer()
t = TSNE()
# --- SECTION 2 ---
# Create the data object from the t-SNE embedding of the dataset,
# registering columns 0 and 1
embedding = pd.DataFrame(t.fit_transform(bc.data))
cluster_data = oe.data(embedding, [0, 1])

# Seed NumPy's global random generator (this happens after the embedding
# above has already been computed)
np.random.seed(123456)
# --- SECTION 3 ---
# Create the ensembles and calculate the homogeneity score
for K in range(2, 8):
    for ensemble_size in (3, 4, 5):
        # Build a fresh ensemble of `ensemble_size` k-means solutions
        ensemble = oe.cluster(cluster_data)
        for member in range(ensemble_size):
            ensemble.cluster('parent', 'kmeans',
                             f'kmeans_{ensemble_size}_{member}', K)

        # Combine the solutions by majority vote and score the result
        # against the dataset's targets
        preds = ensemble.finish_majority_vote(threshold=0.5)
        print(f'K: {K}, size {ensemble_size}:', end=' ')
        score = sklearn.metrics.homogeneity_score(
            bc.target, preds.labels['majority_vote'])
        print(format(score, '.2f'))
Пример #18
0
 def test_clustering_setup(self):
     """A new cluster object exposes the data object, whose D has one entry."""
     clusterer = oe.cluster(self.data)
     n_matrices = len(clusterer.dataObj.D)
     self.assertEqual(1, n_matrices)
Пример #19
0
import pandas as pd
from sklearn import datasets
import openensembles as oe
import matplotlib.pyplot as plt
import seaborn as sns
# Build a two-moons dataset and wrap it in a pandas DataFrame.
x, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.02, random_state=None)
df = pd.DataFrame(x)
# Instantiate the oe data object, then the oe clustering object on top of it.
dataObj = oe.data(df, [1,2])
c = oe.cluster(dataObj)

c_MV_arr = []
val_arr = []
for i in range(39):
    # Add a new clustering solution, with a unique name.
    name = f'kmeans_{i}'
    c.cluster('parent', 'kmeans', name, K=16, init = 'random', n_init = 1)

    # Majority vote over the ensemble, which has one more solution on each iteration.
    vote = c.finish_majority_vote(threshold=0.5)
    c_MV_arr.append(vote)

    # Calculate the determinant ratio metric for this majority vote solution.
    v = oe.validation(dataObj, vote)
    val_name = v.calculate('det_ratio', 'majority_vote', 'parent')
    val_arr.append(v.validation[val_name])

# Calculate and plot the co-occurrence matrix of the full ensemble.
coMat = c.co_occurrence_matrix()
coMat.plot(labels=False)
Пример #20
0
 def test_clustering_NoAlgorithm(self):
     """Requesting an unknown algorithm name must raise ValueError."""
     clusterer = oe.cluster(self.data)
     with self.assertRaises(ValueError):
         clusterer.cluster('parent', 'gobblygook', 'bad')
Пример #21
0
# Build the oe data object, passing a 1-based list with one entry per column of df
d = oe.data(df, list(range(1, len(df.columns) + 1)))
'''
WHAT NEEDS TO BE ADDED FOR FUTURE USAGE FROM OTHERS:
    a) After loading data
        Ensure everything is either normalized or handle in this cell before creating the ensemble
        
    b) Ensure categorical features are encoded
    
    Code cell accidently deleted for encoding but pandas has useful tool for easy encoding
    https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

'''

# Pass the oe data object to the cluster class; the instance is needed
# to query the available algorithms below
c = oe.cluster(d)

# List all algorithms currently available in algorithms.py:
# [kmeans,spectral,agglomerative, DBSCAN,HDBSCAN,AffinityPropagation,Birch,MeanShift,GaussianMixture,]
a = c.algorithms_available()

# Per the original note: keys are parameters {K, linkages, distances} and
# values are lists of the algorithms that use that key as a variable
paramsC = c.clustering_algorithm_parameters()
print(paramsC)
# remove DBSCAN -- this does very well on unstructured data, we want to ask if we can use poorly performing algorithms
# to identify if there isn't structure.
# 'DBSCAN', 'Birch', 'GaussianMixture', 'HDBSCAN', 'MeanShift', 'AffinityPropagation','kmeans', 'agglomerative', 'spectral'
algorithmsToRemove = [
Пример #22
0
 def test_clustering_NoSource(self):
     """Requesting an unknown data source name must raise ValueError."""
     clusterer = oe.cluster(self.data)
     with self.assertRaises(ValueError):
         clusterer.cluster('parentZ', 'kmeans', 'bad')