def main():
    datamat, datalabels = loadDataset("../dataset/lung-cancer.data")
    print 'data ready'
    sampledData, remainedData, sampledIndex, remainedIndex = data_sample(datamat, 1, 10)
    print 'sampledData ready'
    pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'kmeans')
    print 'kmeans end'
    pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'ward')
    print 'ward end'
    pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'complete')
    print 'complete end'
    pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'average')
    print 'average end'

    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)

    # (Re)create the HDF5 scratch file that Cluster_Ensembles expects.
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()

    pop = np.array(pop)
    hypergraph_adjacency = build_hypergraph_adjacency(pop)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name, pop, verbose=True, N_clusters_max=10)

    nmi = normalized_mutual_info_score(datalabels, consensus_clustering_labels)
    ari = adjusted_rand_score(datalabels, consensus_clustering_labels)
    print 'NMI:'
    print nmi
    print 'ARI:'
    print ari
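# The scratch-HDF5 + hypergraph + MCLA sequence above reappears in almost every
# function in this file. A minimal helper that factors it out might look like the
# sketch below; mcla_consensus is a hypothetical name, and the imports assume the
# same Cluster_Ensembles helpers already used above.
import numpy as np
import tables
import Cluster_Ensembles as CE
from Cluster_Ensembles import build_hypergraph_adjacency, store_hypergraph_adjacency

def mcla_consensus(label_runs, n_clusters_max, hdf5_path='./Cluster_Ensembles.h5'):
    """Fuse a stack of label vectors (one clustering per row) with MCLA."""
    # (Re)create the HDF5 scratch file that Cluster_Ensembles expects.
    fileh = tables.open_file(hdf5_path, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()
    label_runs = np.array(label_runs)
    adjacency = build_hypergraph_adjacency(label_runs)
    store_hypergraph_adjacency(adjacency, hdf5_path)
    return CE.MCLA(hdf5_path, label_runs, verbose=True, N_clusters_max=n_clusters_max)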
def ensemble_crossover(population, index_arr):
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()

    individuals = []  # parents selected for the crossover
    clusters_num = []
    # print int(round(len(population) * 0.25))
    for i in range(20):
        # binary tournament selection of a parent individual
        individuals.append(tournament(population, index_arr))
    individuals = np.array(individuals)

    # Determine the range of cluster counts among the selected parents.
    for j in range(len(individuals)):
        individual = individuals[j]
        aa = len(set(individual))
        clusters_num.append(aa)
    sort_clustersNum = sorted(clusters_num)  # sorted() returns a new list, leaving clusters_num untouched
    # The +1 assumes a numpy-style randint with an exclusive upper bound;
    # the stdlib random.randint is inclusive on both ends.
    clusters_max = random.randint(sort_clustersNum[0], sort_clustersNum[-1] + 1)

    hypergraph_adjacency = build_hypergraph_adjacency(individuals)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name, individuals, verbose=True, N_clusters_max=clusters_max)
    ind_ensemble = creator.Individual(consensus_clustering_labels)
    print 'crossover result: %s' % ind_ensemble
    return ind_ensemble
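# tournament() is not defined in this file. Under the binary-tournament comment
# above, a plausible sketch is the following; this is hypothetical, and it assumes
# index_arr holds the indices eligible for selection and that individuals carry
# comparable DEAP fitness objects.
import random

def tournament(population, index_arr):
    """Binary tournament: draw two distinct candidates, return the fitter one."""
    i, j = random.sample(list(index_arr), 2)
    a, b = population[i], population[j]
    return a if a.fitness > b.fitness else b  # DEAP fitness comparison (assumed)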
def cluster_ensembles(cluster_runs, verbose, N_clusters_max, method):
    hdf5_file_name = 'tmp_graph'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()

    hypergraph_adjacency = build_hypergraph_adjacency(cluster_runs)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    cluster_ensemble = method(hdf5_file_name, cluster_runs, verbose, N_clusters_max)
    score = ceEvalMutual(cluster_runs, cluster_ensemble, verbose)
    return score, cluster_ensemble
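# A toy call, assuming the imports used elsewhere in this file: any
# Cluster_Ensembles routine with the (hdf5_file_name, cluster_runs, verbose,
# N_clusters_max) signature can be passed as method, e.g. CE.MCLA or CE.CSPA.
runs = np.array([[0, 0, 1, 1],
                 [0, 0, 1, 2],
                 [1, 1, 0, 0]])
score, labels = cluster_ensembles(runs, verbose=True, N_clusters_max=2, method=CE.MCLA)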
def all_ensemble(population, k):
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()

    pop = []
    for i in range(len(population)):
        ind = []
        ind.extend(population[i])
        pop.append(ind)
    pop = np.array(pop)

    hypergraph_adjacency = build_hypergraph_adjacency(pop)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name, pop, verbose=True, N_clusters_max=k + 2)
    return consensus_clustering_labels
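# Example use (names as elsewhere in this file): fuse the current population into
# a single labeling with at most k + 2 clusters, then score it against the truth.
labels = all_ensemble(population, k=3)
print normalized_mutual_info_score(datalabels, labels)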
def moclenew(datamat):
    # datamat, datalabels = loadDataset("../dataset/glass.data")
    print 'data ready'
    pop_kmeans = ini_population(datamat, 'kmeans', 10)
    print 'kmeans end'
    pop_ward = ini_population(datamat, 'ward', 10)
    print 'ward end'
    pop_complete = ini_population(datamat, 'complete', 10)
    print 'complete end'
    pop_average = ini_population(datamat, 'average', 10)
    print 'average end'
    # pop_spc = ini_population(datamat, 'spc', 1)
    # print 'spc end'

    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)
    # pop.extend(pop_spc)

    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)
    filter_pop = filter(lambda x: len(x) > 0, init_population)  # drop individuals whose initial clustering failed
    population = filter_pop  # the overall population; crossover offspring are appended to it later

    # Distance matrix for the second objective: pairwise distances between all
    # data points (the commented loop only filled the upper triangle).
    # dataLen = len(datamat)
    # distances_matrix = zeros((dataLen, dataLen))
    # for datai in range(dataLen):
    #     for dataj in range(datai + 1, dataLen):
    #         distances_matrix[datai][dataj] = Euclidean_dist(datamat[datai], datamat[dataj])
    distances_matrix = pairwise_distances(datamat, metric='euclidean')
    print 'distance matrix ready'

    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    for ind in invalid_ind:
        euDistance, eu_connect = mocle_index(datamat, distances_matrix, ind)
        fitnesses = (euDistance, eu_connect)
        ind.fitness.values = fitnesses
    # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)),
    #                         tile(distances_matrix, (len(invalid_ind), 1, 1)), invalid_ind)
    # for ind, fit in zip(invalid_ind, fitnesses):
    #     ind.fitness.values = fit

    # population = toolbox.select(population, len(population))
    popeliteLen = len(population)
    for i in range(generation):
        print 'generation %s' % i
        popElite = toolbox.select(population, popeliteLen)

        # Vary the population
        # parentSpring = tools.selTournamentDCD(popElite, popeliteLen)
        # parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []

        # applying crossover: pick two parents and draw an offspring cluster count
        # between their cluster counts (the +1 assumes an exclusive upper bound).
        popcrossover = toolbox.select(population, 2)
        k1 = len(list(set(popcrossover[0])))
        k2 = len(list(set(popcrossover[1])))
        if k1 <= k2:
            k = random.randint(k1, k2 + 1)
        else:
            k = random.randint(k2, k1 + 1)

        # cluster-ensemble crossover operator (MCLA)
        hdf5_file_name = './Cluster_Ensembles.h5'
        fileh = tables.open_file(hdf5_file_name, 'w')
        fileh.create_group(fileh.root, 'consensus_group')
        fileh.close()
        popcrossover = np.array(popcrossover)
        hypergraph_adjacency = build_hypergraph_adjacency(popcrossover)
        store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
        resultList = CE.MCLA(hdf5_file_name, popcrossover, verbose=True, N_clusters_max=k)
        ind_ensemble = creator.Individual(resultList)
        newoffspring.append(ind_ensemble)

        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        for ind1 in invalid_ind:
            euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix, ind1)
            fitnesses1 = (euDistance1, eu_connect1)
            ind1.fitness.values = fitnesses1
        # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)),
        #                         tile(distances_matrix, (len(invalid_ind), 1, 1)), invalid_ind)
        # (only the raw data is used here, never the true labels)
        # for ind, fit in zip(invalid_ind, fitnesses):
        #     ind.fitness.values = fit

        # choosing a population for the next generation
        # population = toolbox.select(popElite + newoffspring, popeliteLen)
        population = popElite + newoffspring

    result1 = toolbox.nondominated(population, len(population))
    nondominated_result = result1[0]
    final_result, pbmValue = computePBM(datamat, nondominated_result)
    return final_result, pbmValue
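# Example use: run the multi-objective loop on one dataset and keep the
# PBM-selected labeling; the path mirrors the commented-out line above.
datamat, datalabels = loadDataset("../dataset/glass.data")
final_labels, pbm = moclenew(datamat)
print adjusted_rand_score(datalabels, final_labels)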
def main():
    # init_population, init_ari, datamat, datalabels = ini_Cluster(kNumber=6)  # initial population from several clustering algorithms
    datamat, datalabels = loadDataset("../dataset/soybean-small.data")
    print 'data ready'
    pop_kmeans = initialMultiRun(datamat, 10, 'kmeans')
    print 'kmeans end'
    pop_ward = initialMultiRun(datamat, 10, 'ward')
    print 'ward end'
    pop_complete = initialMultiRun(datamat, 10, 'complete')
    print 'complete end'
    pop_average = initialMultiRun(datamat, 10, 'average')
    print 'average end'

    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)

    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)
    filter_pop = filter(lambda x: len(x) > 0, init_population)  # drop individuals whose initial clustering failed
    population = filter_pop  # the overall population; crossover offspring are appended to it later

    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)),
                            tile(population, (len(invalid_ind), 1, 1)), invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit
    population = toolbox.select(population, len(population))

    for i in range(generation):
        print 'generation %s' % i
        popElite = toolbox.select(population, int(round(len(population) * 0.5)))  # top half of the population

        # Vary the population
        parentSpring = tools.selTournamentDCD(population, len(population))
        parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []

        # applying crossover
        for indiv1, indiv2 in zip(parentSpring[::2], parentSpring[1::2]):
            randNum = random.random()  # generate a random number from 0 to 1
            if randNum < 0.8:
                toolbox.mate(indiv1, indiv2)
                toolbox.mutate(indiv1)
                toolbox.mutate(indiv2)
                del indiv1.fitness.values, indiv2.fitness.values
                newoffspring.append(indiv1)
                newoffspring.append(indiv2)
            else:
                # otherwise fuse the two parents with MCLA consensus
                hdf5_file_name = './Cluster_Ensembles.h5'
                fileh = tables.open_file(hdf5_file_name, 'w')
                fileh.create_group(fileh.root, 'consensus_group')
                fileh.close()
                individuals = []
                individuals.append(indiv1)
                individuals.append(indiv2)
                individuals = np.array(individuals)
                hypergraph_adjacency = build_hypergraph_adjacency(individuals)
                store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
                consensus_clustering_labels = CE.MCLA(hdf5_file_name, individuals, verbose=True, N_clusters_max=10)
                ind_ensemble = creator.Individual(consensus_clustering_labels)
                newoffspring.append(ind_ensemble)

        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)),
                                tile(newoffspring, (len(invalid_ind), 1, 1)),
                                invalid_ind)  # only the raw data is used here, never the true labels
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # choosing a population for the next generation
        population = toolbox.select(popElite + newoffspring, len(population))

    result1 = toolbox.nondominated(population, len(population))
    print len(result1)
    print result1
    print len(result1[0])
    print result1[0]

    print 'ARI values'
    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari
    print ari_arr
    print max_ari

    nmi_arr = []
    max_nmi = -inf
    print 'NMI values'
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print nmi_arr
    print max_nmi
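# None of the DEAP registrations this file relies on (creator.Individual,
# toolbox.select, toolbox.nondominated, ...) are shown. A plausible minimal
# setup, assuming two minimized objectives and NSGA-II selection (the actual
# registrations may differ), would be:
from deap import base, creator, tools

creator.create('FitnessMin', base.Fitness, weights=(-1.0, -1.0))  # two minimized objectives (assumed)
creator.create('Individual', list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register('select', tools.selNSGA2)                # Pareto elite selection (assumed)
toolbox.register('nondominated', tools.sortNondominated)  # result1[0] is then the first front
# toolbox.mate, toolbox.mutate and toolbox.evaluate are problem-specific and omitted here.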
def multirun(datasetName):
    # datamat, datalabels = loadDataset("../dataset/glass.data")
    path = '../dataset/' + datasetName
    datamat, datalabels = loadDataset(path)
    print 'data ready'
    sampledData, remainedData, sampledIndex, remainedIndex = data_sample(datamat, 1, 2)
    print 'sampledData ready'

    pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'kmeans')
    print 'kmeans end'
    max_nmi1 = -inf
    for ind1 in pop_kmeans:
        nmi1 = normalized_mutual_info_score(datalabels, ind1)
        if nmi1 > max_nmi1:
            max_nmi1 = nmi1
    print 'best initial kmeans NMI: %s' % max_nmi1

    pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'ward')
    print 'ward end'
    max_nmi2 = -inf
    for ind2 in pop_ward:
        nmi2 = normalized_mutual_info_score(datalabels, ind2)
        if nmi2 > max_nmi2:
            max_nmi2 = nmi2
    print 'best initial ward NMI: %s' % max_nmi2

    pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'complete')
    print 'complete end'
    max_nmi3 = -inf
    for ind3 in pop_complete:
        nmi3 = normalized_mutual_info_score(datalabels, ind3)
        if nmi3 > max_nmi3:
            max_nmi3 = nmi3
    print 'best initial complete NMI: %s' % max_nmi3

    pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex, 'average')
    print 'average end'
    max_nmi4 = -inf
    for ind4 in pop_average:
        nmi4 = normalized_mutual_info_score(datalabels, ind4)
        if nmi4 > max_nmi4:
            max_nmi4 = nmi4
    print 'best initial average NMI: %s' % max_nmi4

    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)

    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)
    filter_pop = filter(lambda x: len(x) > 0, init_population)  # drop individuals whose initial clustering failed
    population = filter_pop  # the overall population; crossover offspring are appended to it later

    # Distance matrix for the second objective: pairwise distances between all
    # data points (the commented loop only filled the upper triangle).
    # dataLen = len(datamat)
    # eudataPointMatrix = zeros((dataLen, dataLen))
    # for datai in range(dataLen):
    #     for dataj in range(datai + 1, dataLen):
    #         eudataPointMatrix[datai][dataj] = Euclidean_dist(datamat[datai], datamat[dataj])
    distances_matrix = pairwise_distances(datamat, metric='euclidean')
    print 'distance matrix ready'

    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)),
    #                         tile(distances_matrix, (len(invalid_ind), 1, 1)), invalid_ind)
    # for ind, fit in zip(invalid_ind, fitnesses):
    #     ind.fitness.values = fit
    for ind1 in invalid_ind:
        euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix, ind1)
        fitnesses1 = (euDistance1, eu_connect1)
        ind1.fitness.values = fitnesses1

    # population = toolbox.select(population, len(population))
    popeliteLen = len(population)
    for i in range(generation):
        print 'generation %s' % i
        popElite = toolbox.select(population, popeliteLen)

        # Vary the population
        # parentSpring = tools.selTournamentDCD(popElite, popeliteLen)
        # parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []

        # applying crossover
        subpopArr = getSubPop(popElite)
        count = 0  # counts how many new individuals are added
        for subpop in subpopArr:
            # DSCE crossover operator:
            # a1 = 0.6
            # a2 = 0.5
            # transMatrix, popClusterArr_3, popClusterArr_2, clusterNumArr = transformation(datamat, subpop)
            # similiarMatrix = measureSimilarity(transMatrix, popClusterArr_3, popClusterArr_2,
            #                                    clusterNumArr, datamat, a1=a1)
            # dictCownP = assign(similiarMatrix, a2)
            # resultList = resultTransform(dictCownP, datamat)

            # alternative cluster-ensemble operator (CSPA)
            hdf5_file_name = './Cluster_Ensembles.h5'
            fileh = tables.open_file(hdf5_file_name, 'w')
            fileh.create_group(fileh.root, 'consensus_group')
            fileh.close()
            subpop = np.array(subpop)
            hypergraph_adjacency = build_hypergraph_adjacency(subpop)
            store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
            resultList = CE.CSPA(hdf5_file_name, subpop, verbose=True, N_clusters_max=3)
            resultList = list(resultList)

            clu = list(set(resultList))
            clulen = len(clu)
            actual_resultList = []
            if clulen > 1:
                ind_ensemble = creator.Individual(resultList)
                newoffspring.append(ind_ensemble)
                actual_resultList = resultList  # only an ensemble with more than one cluster yields an offspring
                count += 1
            if actual_resultList:
                predicted_clusternum = len(set(actual_resultList))
                ind_new = KMeans(n_clusters=predicted_clusternum).fit_predict(datamat)
                ind_new_tran = creator.Individual(ind_new)
                newoffspring.append(ind_new_tran)
                count += 1
        print 'added %s new individuals this generation' % count

        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat, (len(invalid_ind), 1, 1)),
        #                         tile(distances_matrix, (len(invalid_ind), 1, 1)), invalid_ind)
        # (only the raw data is used here, never the true labels)
        # for ind, fit in zip(invalid_ind, fitnesses):
        #     ind.fitness.values = fit
        for ind1 in invalid_ind:
            euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix, ind1)
            fitnesses1 = (euDistance1, eu_connect1)
            ind1.fitness.values = fitnesses1

        # choosing a population for the next generation
        # population = toolbox.select(popElite + newoffspring, popeliteLen)
        population = popElite + newoffspring

    result1 = toolbox.nondominated(population, len(population))
    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari

    nmi_arr = []
    max_nmi = -inf
    print 'NMI values'
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print 'best NMI: %s' % max_nmi
    print nmi_arr
    return max_nmi, max_ari
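# Example use: evaluate one dataset end to end (file name as in main() above).
max_nmi, max_ari = multirun('lung-cancer.data')
print 'NMI: %s, ARI: %s' % (max_nmi, max_ari)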