def test_get_query_and_evaluation_analysis_types(self):
    """
    Checks the analysis types that AnalysisPopulator extracts from a parameters structure:
    first the ones named inside the evaluation criteria, then those merged with the
    entries of "query_types". Element order is not checked.
    """
    criteria = {
        "criteria_0": {
            "CythonMirrorCohesion": {"action": ">", "weight": 0.05},
            "CythonMinimumMeanSeparation": {"action": ">", "weight": 0.1},
            "CythonSilhouette": {"action": ">", "weight": 0.15}
        }
    }
    query_types = ["NumClusters", "CythonMinimumMeanSeparation", "NoiseLevel"]
    parameters = {
        "clustering": {
            "evaluation": {
                "evaluation_criteria": criteria,
                "query_types": query_types
            }
        }
    }

    expected_evaluation_types = ['CythonMinimumMeanSeparation',
                                 'CythonMirrorCohesion',
                                 'CythonSilhouette']
    # "CythonMinimumMeanSeparation" is both a query type and a criteria entry, but it is
    # expected only once in the merged result.
    expected_query_and_evaluation_types = ['CythonMinimumMeanSeparation',
                                           'NumClusters',
                                           'CythonMirrorCohesion',
                                           'NoiseLevel',
                                           'CythonSilhouette']

    self.assertItemsEqual(
        AnalysisPopulator.get_evaluation_analysis_types(parameters),
        expected_evaluation_types)

    self.assertItemsEqual(
        AnalysisPopulator.get_query_and_evaluation_analysis_types(parameters),
        expected_query_and_evaluation_types)
def choose_best(self, clustering_info):
    """
    Normalizes the values of the evaluation scores, then calculates the scores for all clusterings
    and criteria and finally chooses the best clustering.

    @param clustering_info: Is the clustering_info structure with clusterings, evaluation info... etc

    @return: A (best_clustering_id, scores) tuple. If 'clustering_info' is empty a warning is
    printed and None is returned instead. If no evaluation criteria are defined, the id is picked
    at random and the scores are an empty dictionary.
    """
    if len(clustering_info) == 0:
        # Single parenthesised argument: prints the same text with the print statement
        # and with the print function.
        print("[WARNING BestClusteringSelector::choose_best] clustering_info is empty.")
        return None

    evaluation_types = AnalysisPopulator.get_evaluation_analysis_types(self.parameters)

    # If there were no criteria defined, then the clustering is randomly selected.
    # The *id* is returned (not the clustering_info entry) along with an empty scores
    # container, so that callers can unpack the result exactly as in the scored path.
    # NOTE(review): an empty dict is assumed to be an acceptable "no scores" value - confirm.
    if not evaluation_types:
        return random.choice(list(clustering_info)), {}

    for evaluation_type in evaluation_types:
        BestClusteringSelector.normalize_one_evaluation_type(evaluation_type, clustering_info)

    scores = BestClusteringSelector.get_scores_for_all_clusters_and_criterias(self.criteria, clustering_info)

    # The winning criteria id is not part of the returned value.
    best_clustering_id, _criteria_id, scores = self.get_best_clustering(scores)
    return best_clustering_id, scores
def choose_best(self, clustering_info):
    """
    Normalizes the values of the evaluation scores, then calculates the scores for all clusterings
    and criteria and finally chooses the best clustering.

    @param clustering_info: Is the clustering_info structure with clusterings, evaluation info... etc

    @return: A (best_clustering_id, scores) tuple. If 'clustering_info' is empty a warning is
    printed and None is returned instead. If no evaluation criteria are defined, the id is picked
    at random and the scores are an empty dictionary.
    """
    if len(clustering_info) == 0:
        # Single parenthesised argument: prints the same text with the print statement
        # and with the print function.
        print("[WARNING BestClusteringSelector::choose_best] clustering_info is empty.")
        return None

    evaluation_types = AnalysisPopulator.get_evaluation_analysis_types(
        self.parameters)

    # If there were no criteria defined, then the clustering is randomly selected.
    # The *id* is returned (not the clustering_info entry) along with an empty scores
    # container, so that callers can unpack the result exactly as in the scored path.
    # NOTE(review): an empty dict is assumed to be an acceptable "no scores" value - confirm.
    if not evaluation_types:
        random_clustering_id = random.choice(list(clustering_info))
        return random_clustering_id, {}

    for evaluation_type in evaluation_types:
        BestClusteringSelector.normalize_one_evaluation_type(
            evaluation_type, clustering_info)

    scores = BestClusteringSelector.get_scores_for_all_clusters_and_criterias(
        self.criteria, clustering_info)

    # The winning criteria id is not part of the returned value.
    best_clustering_id, _criteria_id, scores = self.get_best_clustering(scores)
    return best_clustering_id, scores
def run(self, clustering_parameters, matrixHandler, workspaceHandler, trajectoryHandler):
    """
    Runs the four stages of the protocol in order: clustering exploration, filtering,
    evaluation of the clusterings that passed the filter and selection of the best one.
    Each stage is timed with self.timer, and progress is reported through self.notify.

    @param clustering_parameters: The parameters structure. This method reads its
    ["global"]["control"] and ["clustering"]["evaluation"] entries.
    @param matrixHandler: Handed to the explorer, parameters generator, filter and analysis populator.
    @param workspaceHandler: Handed to the explorer.
    @param trajectoryHandler: Handed to the analysis populator.

    @return: None if no clustering passes the filter. Otherwise the tuple
    (best_clustering_id, selected_clusterings, not_selected_clusterings, all_scores).
    """
    ############################
    # Clustering exploration
    ############################
    self.notify("Exploration Started", [])
    self.timer.start("Clustering Exploration")
    exploration_scheduler = scheduling_tools.build_scheduler(
        clustering_parameters["global"]["control"],
        self.observer)
    parameters_generator = AlgorithmRunParametersGenerator(clustering_parameters,
                                                           matrixHandler)
    explorer = ClusteringExplorer(clustering_parameters,
                                  matrixHandler,
                                  workspaceHandler,
                                  exploration_scheduler,
                                  parameters_generator,
                                  self.observer)
    clusterings = explorer.run()
    self.notify("Clusterings Created", {"number_of_clusters": len(clusterings)})
    self.timer.stop("Clustering Exploration")

    ######################
    # First filtering
    ######################
    self.timer.start("Clustering Filtering")
    clustering_filter = ClusteringFilter(clustering_parameters["clustering"]["evaluation"],
                                         matrixHandler)
    selected_clusterings, not_selected_clusterings = clustering_filter.filter(clusterings)
    filter_summary = {
        "selected": len(selected_clusterings.keys()),
        "not_selected": len(not_selected_clusterings.keys())
    }
    self.notify("Filter", filter_summary)
    self.timer.stop("Clustering Filtering")

    # Nothing left to evaluate: the remaining stages are skipped.
    if selected_clusterings == {}:
        return None

    ######################
    # Clustering scoring
    ######################
    self.timer.start("Evaluation")
    evaluation_scheduler = scheduling_tools.build_scheduler(
        clustering_parameters["global"]["control"],
        self.observer)
    populator = AnalysisPopulator(matrixHandler,
                                  trajectoryHandler,
                                  clustering_parameters)
    analyzer = AnalysisRunner(evaluation_scheduler, selected_clusterings, populator)
    analyzer.evaluate()
    self.timer.stop("Evaluation")

    ######################
    # Choose the best clustering
    ######################
    self.timer.start("Selection")
    selector = BestClusteringSelector(clustering_parameters)
    best_clustering_id, all_scores = selector.choose_best(selected_clusterings)
    self.timer.stop("Selection")

    return best_clustering_id, selected_clusterings, not_selected_clusterings, all_scores
def __init__(self, parameters):
    """
    Initializes the base AnalysisPopulator with empty strings as its first two
    arguments, so that only 'parameters' carries real data.

    @param parameters: Forwarded unchanged as the third argument of AnalysisPopulator.__init__.
    """
    first_placeholder = ""
    second_placeholder = ""
    AnalysisPopulator.__init__(self, first_placeholder, second_placeholder, parameters)
def run(self, clustering):
    """
    Refines a clustering cluster by cluster. Each cluster is re-partitioned with
    self.repartition_with_kmedoids for several values of k; the resulting candidates
    (plus the untouched cluster itself) are evaluated and the clusters of the best
    candidate are kept.

    @param clustering: The clustering to refine. Its 'clusters' attribute is iterated.

    @return: A dictionary with the keys "type" ("refined_clustering"), "clustering" (a
    Clustering built from all the kept clusters) and "parameters" (the refinement parameters).
    """
    max_partitions = self.refinement_parameters["max_partitions"]
    tries_per_cluster = self.refinement_parameters["tries_per_cluster"]
    # Distance between consecutive k values to try; never smaller than 1.
    try_step = int(max(1, float(max_partitions) / tries_per_cluster))
    matrix = self.matrixHandler.distance_matrix

    new_clusters = []
    for cluster in clustering.clusters:
        base_id = cluster.id

        # The unrefined cluster is stored as a candidate too, so it also competes
        # for the best clustering prize.
        candidates = {
            base_id: {
                "type": "refined_base",
                "clustering": Clustering([cluster]),
                "parameters": {}
            }
        }

        submatrix = get_submatrix(matrix, cluster.all_elements)

        # Proceed with some K Medoids partitions
        # TODO: Generate parameters with parameter generator
        for k in range(2, max_partitions, try_step):
            repartition = self.repartition_with_kmedoids(cluster, k, submatrix)
            candidate_id = "%s_%d" % (base_id, k)
            candidates[candidate_id] = {
                "type": "refined",
                "clustering": repartition,
                "parameters": {"k": k}
            }

        # Evaluate all clusterings and pick the best one
        # NOTE(review): the scheduler options are read from ["clustering"]["control"];
        # confirm this is the intended section of the parameters structure.
        scheduler = scheduling_tools.build_scheduler(
            self.clustering_parameters["clustering"]["control"],
            self.observer)
        populator = AnalysisPopulator(self.matrixHandler,
                                      self.trajectoryHandler,
                                      self.clustering_parameters)
        AnalysisRunner(scheduler, candidates, populator).evaluate()

        selector = BestClusteringSelector(self.clustering_parameters)
        best_clustering_id, all_scores = selector.choose_best(candidates)  # @UnusedVariable
        best_candidate = candidates[best_clustering_id]
        new_clusters.extend(best_candidate["clustering"].clusters)

    # Convert all new clusters in the new clustering
    return {
        "type": "refined_clustering",
        "clustering": Clustering(new_clusters),
        "parameters": self.refinement_parameters
    }