def consistency_selection_ensemble(labels, mlset, nlset, logger, must_threshold, cannot_threshold,
                                   normalized=True, weighted=False, weighted_type='both', alpha=1):
    """
    Do selection ensemble using must/cannot consistency as criteria.

    Clusterings whose (optionally min-max normalized) must-link / cannot-link
    consistencies fall below the given thresholds are removed before consensus
    (CSPA / HGPA / MCLA). Results are reported through the logger.

    :param labels: 2-d array of clusterings, one per row; the last 5 rows are
                   non-candidate solutions and the final row the ground truth
                   (same convention as the rest of this module)
    :param mlset: must-link constraint set
    :param nlset: cannot-link constraint set
    :param logger: logger used for progress/result output
    :param must_threshold: minimum must-link consistency for selection
    :param cannot_threshold: minimum cannot-link consistency for selection
    :param normalized: min-max scale consistencies before thresholding
    :param weighted: use the weighted consensus variants
    :param weighted_type: consistency type used to build weights
    :param alpha: balance factor between clustering/cluster-level weights
    :return: None (NMI scores are written to the logger)
    """
    class_num = len(np.unique(labels[-1]))
    # Drop the 5 trailing non-candidate rows (single kmeans, cspa, hgpa, mcla, real labels).
    candidate_labels = labels[0:-5]
    must_consistencies = []
    cannot_consistencies = []
    clustering_weights = []
    cluster_level_weights = []
    k_value = []
    for label in candidate_labels:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label, mlset, nlset, cons_type='cannot')
        if weighted:
            clustering_weights.append(
                Metrics.consistency(label, mlset, nlset, cons_type=weighted_type))
            cluster_level_weights.append(
                Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=weighted_type))
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
        k_value.append(len(np.unique(label)))
    if normalized:
        # MinMaxScaler expects a 2-d array; reshape to a column, then flatten back.
        scaler = preprocessing.MinMaxScaler()
        must_consistencies = scaler.fit_transform(
            np.array(must_consistencies).reshape(-1, 1)).ravel()
        cannot_consistencies = scaler.fit_transform(
            np.array(cannot_consistencies).reshape(-1, 1)).ravel()
    # BUG FIX: when normalized=False these are plain lists and `list >= float`
    # raises TypeError in Python 3 — coerce to arrays before thresholding.
    idx = np.logical_and(np.asarray(must_consistencies) >= must_threshold,
                         np.asarray(cannot_consistencies) >= cannot_threshold)
    selected_labels = candidate_labels[idx]
    k_value = np.array(k_value)[idx]
    logger.debug('[Consistency] Start consensus...shape=' + str(selected_labels.shape))
    if selected_labels.shape[0] == 0:
        logger.debug('[Consistency] No clusterings are selected. Out.')
        return
    logger.debug('[Consistency] Average k is ' + str(np.mean(k_value)))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels, N_clusters_max=class_num, weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights, alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels, N_clusters_max=class_num, weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights, alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels, N_clusters_max=class_num)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    return
def k_selection_ensemble(labels, k_threshold, logger, weighted=False, alpha=0,
                         mlset=None, nlset=None, ctype='both'):
    """
    Do selection ensemble using k as criteria.

    Clusterings with k smaller than k_threshold will be removed before
    consensus (CSPA / HGPA / MCLA). Results are reported through the logger.

    :param labels: 2-d array of clusterings, one per row; the last 5 rows are
                   non-candidate solutions and the final row the ground truth
    :param k_threshold: minimum number of clusters a clustering must have
    :param logger: logger used for progress/result output
    :param weighted: weighted version or not
    :param alpha: balance factor that control the importance of
                  clustering/cluster consistency in weights (weighted version only)
    :param mlset: must-link set (weighted version only)
    :param nlset: cannot-link set (weighted version only)
    :param ctype: type of consistency (weighted version only)
    :return: None (NMI scores are written to the logger)
    """
    class_num = len(np.unique(labels[-1]))
    # select those clusterings whose k reaches the threshold.
    k_value = np.array([len(np.unique(label)) for label in labels[0:-5]])
    idx = k_value.ravel() >= k_threshold
    selected_labels = labels[0:-5][idx]
    # cluster-level and clustering-level weights (weighted version only);
    # both collected in a single pass over the selected clusterings.
    con_per_cluster = []
    con_clustering = []
    if weighted:
        for label in selected_labels:
            con_per_cluster.append(
                Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=ctype))
            con_clustering.append(
                Metrics.consistency(label, mlset, nlset, cons_type=ctype))
    logger.debug('[K] Start consensus...shape=' + str(selected_labels.shape))
    logger.debug('[K] Average k is ' + str(np.mean(k_value[idx])))
    if weighted:
        logger.debug('[K] weighted consensus, alpha=' + str(alpha))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels, N_clusters_max=class_num, weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster, alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels, N_clusters_max=class_num, weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster, alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(
        selected_labels, N_clusters_max=class_num, weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster, alpha=alpha)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    logger.debug('--------------------------------------------')
    return
def do_7th_weighted_ensemble_for_library(library_folder, library_name, class_num, target,
                                         constraint_file, logger, alphas, internals,
                                         cons_type='both',
                                         ensemble_method=_default_ensemble_method,
                                         scale=False):
    """
    Run weighted consensus over a stored clustering library for several alphas.

    Clustering-level weights are the constraint consistencies, normalized by
    the expected consistency of clusterings with the same k and rescaled by
    the supplied internal quality values.

    :param library_folder: folder containing the library '.res' file
    :param library_name: library file name (without the '.res' extension)
    :param class_num: number of clusters for the consensus result
    :param target: ground-truth labels used to score the consensus
    :param constraint_file: file holding must-link/cannot-link constraints
    :param logger: logger used for progress/result output
    :param alphas: iterable of balance factors to evaluate
    :param internals: per-clustering internal quality values used to rescale
                      the clustering-level weights (one per library row)
    :param cons_type: type of consistency used for the weights
    :param ensemble_method: names of consensus methods to run (keys of
                            _ensemble_method)
    :param scale: min-max scale the clustering-level weights before use
    :return: list of per-alpha lists of NMI scores, one entry per method
    """
    logger.debug(
        '==========================================================================================='
    )
    logger.debug('-----------------New Weighted Ensemble for library:' + str(library_name) +
                 '-------------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')
    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)
    k_values = []
    expected_cons = {}
    # if the library is not pure, i.e., ensemble results and targets are also included,
    # then the last 5 rows should be removed (single kmeans, cspa, hgpa, mcla, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)
    # get cluster/clustering level weights and k in a single pass over the library
    con_per_cluster = []
    con_clustering = []
    for label in labels:
        con_per_cluster.append(
            Metrics.consistency_per_cluster(label, mlset, nlset, cons_type=cons_type))
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    k_values = np.array(k_values, dtype=int)
    cons = np.array(con_clustering)
    # expected (mean) consistency per distinct k, used to normalize the weights
    for k in np.unique(k_values):
        mean_value = np.mean(cons[k_values == k])
        if mean_value == 0:
            # guard against division by zero below
            mean_value = 1
        expected_cons[k] = mean_value
    for i in range(0, labels.shape[0]):
        con_clustering[i] /= expected_cons[k_values[i]]
        con_clustering[i] *= internals[i]
    if scale:
        # BUG FIX: MinMaxScaler requires a 2-d array (a 1-d input errors on
        # modern sklearn) — reshape to a column and flatten back, matching the
        # idiom used elsewhere in this module.
        scaler = preprocessing.MinMaxScaler()
        con_clustering = scaler.fit_transform(
            np.array(con_clustering).reshape(-1, 1)).ravel()
    nmis = []
    for alpha in alphas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels, N_clusters_max=class_num, weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster, alpha=alpha)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(ensemble_labels, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' + str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(
        '==========================================================================================='
    )
    return nmis