def algorithms_available(self):
    """
    List the clustering algorithms that can currently be requested.

    Returns
    -------
    Whatever ``clustering_algorithms_available()`` of the
    ``ca.clustering_algorithms`` helper returns. ``cluster`` in this class
    treats it as a mapping keyed by algorithm name.
    """
    # A throwaway helper built on the 'parent' data with empty parameters;
    # it is only used to query the listing.
    return ca.clustering_algorithms(
        self.dataObj.D['parent'], {}).clustering_algorithms_available()
def cluster(self, source_name, algorithm, output_name, K=None, Require_Unique=False, random_seed=None, **kwargs):
    """
    Run a clustering algorithm on the data matrix named by source_name and
    store the resulting solution under output_name.

    Algorithm-specific parameters are passed through ``**kwargs``.
    Note that K is required for most algorithms.

    Parameters
    ----------
    source_name: string
        the source data matrix name to operate on in clusterclass dataObj
    algorithm: string
        name of the algorithm to use; call ``algorithms_available()`` for
        the supported names
    output_name: string
        the dict key for interacting with the results of this clustering
        solution in any of the cluster class dictionary attributes
    K: int
        number of clusters to create (ignored for algorithms that define K
        during clustering). The stored parameters record K afterwards:
        either the value passed, or the number of clusters produced when K
        was not passed.
    Require_Unique: bool
        If False and a solution named output_name already exists, a random
        number is appended to create a unique name (a UserWarning reports
        the new name). If True and a solution by that name exists, a
        warning is issued and the method returns without clustering or
        storing anything.
        Default Require_Unique=False
    random_seed: int, RNG state, or None
        Either a state object accepted by ``random.set_state`` or a seed
        accepted by ``random.seed``. When ``set_state`` rejects the value
        with TypeError it is passed to ``random.seed`` instead and the
        resulting ``random.get_state()`` is recorded. The recorded state is
        restored immediately before clustering and stored with the
        solution. Default is None.
        NOTE(review): ``set_state``/``get_state`` are the numpy.random
        spellings; the import binding ``random`` is outside this view --
        confirm.
    **kwargs
        Extra algorithm parameters, forwarded to
        ``ca.clustering_algorithms`` together with ``random_state``.

    Returns
    -------
    None. Results are stored in ``self.labels``, ``self.data_source``,
    ``self.params``, ``self.clusterNumbers``, ``self.algorithms`` and
    ``self.random_state`` under the (possibly altered) output_name.

    Warnings
    --------
    Warns if the number of clusters is different than what was requested,
    typically when an algorithm does not accept K as an argument. Also
    warns when output_name had to be altered, or when the solution was
    skipped because Require_Unique was set.

    Raises
    ------
    ValueError
        if data source is not available by source_name, or if algorithm
        is not one of the available algorithms

    Examples
    --------
    Cluster using KMeans on parent data

    >>> c = oe.cluster
    >>> c.cluster('parent', 'kmeans', 'kmeans_parent', K=5)

    Form an iteration to build an ensemble using different values for K

    >>> for k in range(2, 12):
    ...     name = 'kmeans_' + str(k)
    ...     c.cluster('parent', 'kmeans', name, K=k)
    """
    # Validate both lookups up front so that a bad request fails before the
    # global RNG is reseeded or output_name is altered.
    if source_name not in self.dataObj.D:
        raise ValueError(
            "ERROR: the source you requested for clustering does not exist by that name %s"
            % (source_name))

    available = self.algorithms_available()
    if algorithm not in available:
        raise ValueError(
            "The algorithm you requested does not exist, currently the following are supported %s"
            % (list(available.keys())))

    # Algorithm-specific parameters; random_state is added below.
    var_params = dict(kwargs)

    # Try the value as a full RNG state first; when set_state rejects it
    # with TypeError, treat it as a seed and capture the resulting state so
    # the run can be reproduced later.
    try:
        random.set_state(random_seed)
        state = random_seed
    except TypeError:
        random.seed(random_seed)
        state = random.get_state()
    var_params['random_state'] = state

    # Resolve a name collision with an existing solution according to
    # Require_Unique.
    if output_name in self.labels:
        if Require_Unique:
            warnings.warn(
                'The name of the clustering solution is redundant and you required unique, solution will not be added'
            )
            return
        base_name = output_name
        output_name = "%s_%d" % (base_name, randint(0, 10000))
        while output_name in self.labels:
            output_name = "%s_%d" % (base_name, randint(0, 10000))
        warnings.warn(
            'For uniqueness, altered output_name to be %s' % (output_name),
            UserWarning)

    # Restore the recorded state right before clustering: the randint draws
    # above may have advanced the generator.
    random.set_state(state)
    c = ca.clustering_algorithms(self.dataObj.D[source_name], var_params, K)
    getattr(c, algorithm)()

    # c.out now holds the finished assignment and c.var_params the
    # parameters used; record K as requested, or as produced when it was
    # not requested (None or 0).
    uniqueClusters = np.unique(c.out)
    if K:
        c.var_params['K'] = K
        if len(uniqueClusters) != K:
            warnings.warn(
                "Number of unique clusters %d returned does not match number requested %d for solution: %s"
                % (len(uniqueClusters), K, output_name), UserWarning)
    else:
        c.var_params['K'] = len(uniqueClusters)

    self.labels[output_name] = c.out
    self.data_source[output_name] = source_name
    self.params[output_name] = c.var_params
    self.clusterNumbers[output_name] = uniqueClusters
    self.algorithms[output_name] = algorithm
    self.random_state[output_name] = state