Example No. 1
def ensemble_crossover(population, index_arr):
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()
    individuals = []  # set of parent individuals used for crossover
    clusters_num = []
    # print int(round(len(population)*0.25))
    for i in range(20):
        individuals.append(tournament(population, index_arr))  # select a parent individual via binary tournament selection
    individuals = np.array(individuals)
    for j in range(len(individuals)):  # determine the range of cluster counts among the parents
        individual = individuals[j]
        aa = len(set(individual))
        clusters_num.append(aa)
    sort_clustersNum = sorted(clusters_num)  # sorted() returns a new sorted list (list.sort() would modify in place)
    clusters_max = random.randint(sort_clustersNum[0],
                                  sort_clustersNum[-1] + 1)
    hypergraph_adjacency = build_hypergraph_adjacency(individuals)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name,
                                          individuals,
                                          verbose=True,
                                          N_clusters_max=clusters_max)
    ind_ensemble = creator.Individual(consensus_clustering_labels)
    print('Crossover result: %s' % ind_ensemble)
    return ind_ensemble
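
For reference, a minimal standalone sketch of the same consensus-as-crossover idea using the package's top-level entry point; this assumes Cluster_Ensembles is importable as CE and that each parent is an integer label array over the same samples (an illustration, not the function above):

import numpy as np
import Cluster_Ensembles as CE

# two toy "parent" labelings over the same six samples
parent_a = np.array([0, 0, 1, 1, 2, 2])
parent_b = np.array([1, 1, 0, 0, 2, 2])
cluster_runs = np.vstack([parent_a, parent_b])  # shape (n_parents, n_samples)

# the consensus labeling plays the role of the offspring
offspring = CE.cluster_ensembles(cluster_runs, N_clusters_max=3)
print(offspring)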
Example No. 2
def lightlda_se_tsne(data, k, n_runs=10, init='basic', **se_params):
    print("lightlda_se_tsne")
    clusters = []
    tsne = TSNE(2)
    km = KMeans(k)
    n_runs = 3  # note: this overrides the n_runs argument above
    # Prepare LightLDA data
    prepare_lightlda_data(data, "lightlda_data/LightLDA_input")

    # Run LightLDA several times, and then find the consensus clusters
    for i in range(n_runs):
        m, w, ll = lightlda_estimate_state(data,
                                           k,
                                           prepare_data=False,
                                           **se_params)
        tsne_w = tsne.fit_transform(w.T)
        clust = km.fit_predict(tsne_w)
        clusters.append(clust)
    clusterings = np.vstack(clusters)
    consensus = CE.cluster_ensembles(clusterings,
                                     verbose=False,
                                     N_clusters_max=k)

    # Initialize a new LightLDA run with the consensus clusters
    init_m, init_w = nmf_init(data, consensus, k, 'basic')
    M, W, ll = lightlda_estimate_state(data,
                                       k,
                                       init_means=init_m,
                                       init_weights=init_w,
                                       prepare_data=False,
                                       **se_params)
    return M, W, ll
Example No. 3
def nmf_tsne(data, k, n_runs=10, init='enhanced', **params):
    """
    runs tsne-consensus-NMF

    1. run a bunch of NMFs, get W and H
    2. run tsne + km on all WH matrices
    3. run consensus clustering on all km results
    4. use consensus clustering as initialization for a new run of NMF
    5. return the W and H from the resulting NMF run
    """
    clusters = []
    nmf = NMF(k)
    tsne = TSNE(2)
    km = KMeans(k)
    for i in range(n_runs):
        w = nmf.fit_transform(data)
        h = nmf.components_
        tsne_wh = tsne.fit_transform(w.dot(h).T)
        clust = km.fit_predict(tsne_wh)
        clusters.append(clust)
    clusterings = np.vstack(clusters)
    consensus = CE.cluster_ensembles(clusterings,
                                     verbose=False,
                                     N_clusters_max=k)
    nmf_new = NMF(k, init='custom')
    # TODO: find an initialization for the consensus W and H
    init_w, init_h = nmf_init(data, consensus, k, init)
    W = nmf_new.fit_transform(data, W=init_w, H=init_h)
    H = nmf_new.components_
    return W, H
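
The docstring above spells out the tsne-consensus-NMF pipeline step by step. A possible usage sketch, assuming nmf_tsne and its dependencies (NMF, TSNE, KMeans, CE, nmf_init) are already in scope and that data is a non-negative genes x cells matrix:

import numpy as np

np.random.seed(0)
data = np.random.poisson(2.0, size=(200, 80))  # hypothetical genes x cells counts

# consensus-initialized NMF; n_runs kept small for illustration
W, H = nmf_tsne(data, k=5, n_runs=3)
print(W.shape, H.shape)  # expected: (200, 5) and (5, 80)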
Example No. 4
def main():
    if not os.path.isdir("data/results"):
        os.makedirs("data/results")

    print("generating co affiliation matrix and graph")
    #co_affiliation = generate_affiliation_coaffiliation()
    co_affiliation = pd.read_hdf("data/results/co_affiliation.hdf")
    print(co_affiliation.shape)
    #generate_graph(numpy_matrix = co_affiliation.to_numpy(), partition_name = "co_affiliation")

    # print("generating documents auteurs matrix")
    # generate_document_auteurs()

    # print("generating doc term matrix")
    # generate_doc_terms()

    # print("done")

    # print("Using all values")

    c_1 = np.genfromtxt('data/results/partition_co_affiliation.csv', delimiter=',')
    c_2 = np.genfromtxt('data/results/partition_co_term.csv', delimiter=',')
    c_3 = np.genfromtxt('data/results/partition_co_auteur.csv', delimiter=',')
    cluster_run = np.array([c_1, c_2, c_3])
    consensus_labels = CE.cluster_ensembles(cluster_run, N_clusters_max = 10)

    # np.savetxt("data/results/consensus.csv", consensus_labels, delimiter = ";")
    consensus_labels = np.genfromtxt("data/results/consensus.csv", delimiter = ";")
    consensus_labels = consensus_labels.astype(int)
    reorganise_graph_from_consensus(numpy_matrix = co_affiliation.to_numpy(), partition_name = "co_affiliation", consensus= consensus_labels)
Example No. 5
def main():
    datamat,datalabels = loadDataset("../dataset/lung-cancer.data")
    print 'data ready'

    sampledData, remainedData, sampledIndex, remainedIndex = data_sample(datamat,1,10)
    print 'sampledData ready'
    pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'kmeans')
    print 'kmeans end'
    pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'ward')
    print 'ward end'
    pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'complete')
    print 'complete end'
    pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,'average')
    print 'average end'
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()

    pop = np.array(pop)
    hypergraph_adjacency = build_hypergraph_adjacency(pop)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name, pop, verbose=True, N_clusters_max=10)
    nmi = normalized_mutual_info_score(datalabels, consensus_clustering_labels)
    ari = adjusted_rand_score(datalabels, consensus_clustering_labels)
    print('NMI value: ')
    print(nmi)
    print('ARI value: ')
    print(ari)
Example No. 6
def run_cluster_ensembles(row_labels, number_of_classes, y):
    res = CE.cluster_ensembles(cluster_runs=row_labels,
                               N_clusters_max=number_of_classes)

    #nmis.append(nmi(res, y.ravel()))
    #aris.append(ari(res, y.ravel()))

    return res
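
run_cluster_ensembles expects row_labels to hold one clustering run per row. A small sketch of building that input from two different scikit-learn clusterers, assuming this function and Cluster_Ensembles are in scope:

import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=100, centers=4, random_state=0)
run_a = KMeans(n_clusters=4, random_state=0).fit_predict(X)
run_b = AgglomerativeClustering(n_clusters=4).fit_predict(X)
row_labels = np.vstack([run_a, run_b])  # shape (n_runs, n_samples)

res = run_cluster_ensembles(row_labels, number_of_classes=4, y=y)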
Example No. 7
    def get(self, x):
        """
        Parameters
        __________
        x: array, shape (n_samples, n_features)
            The data array.

        Returns
        _______
        y: array, shape (n_samples,)
            List of labels that correspond to the best clustering k, as
            evaluated by eval_obj.

        """
        try:
            import Cluster_Ensembles as CE
        except ImportError:
            raise ImportError(
                "Manually install the Cluster_Ensembles package if you "
                "wish to run ensemble clustering. For more information, "
                "see here: https://pypi.org/project/Cluster_Ensembles/")

        self.logger.info("Initializing Ensemble Clustering.")
        self.logger.info("Using the following methods:")
        self.logger.info(", ".join(self.methods))

        if len(self.methods) == 1:
            # No need to do ensemble if only one method
            return clu_wrap(self.methods[0])(eval_obj=self.eval_obj,
                                             n_clusters=self.n_clusters,
                                             n_jobs=self.n_jobs,
                                             **self.kwargs).get(x)
        elif len(self.methods) < 1:
            raise ValueError("No methods specified for ensemble clustering.")

        # initialize empty partition matrix
        partitions = np.zeros((len(self.methods), x.shape[0]))
        scores = []

        for i, method in enumerate(self.methods):
            clu_obj = clu_wrap(method)(eval_obj=self.eval_obj,
                                       n_clusters=self.n_clusters,
                                       n_jobs=self.n_jobs,
                                       **self.kwargs)
            partitions[i, :], score = clu_obj.get(x)
            scores.append(np.max(score))

        ensemble_labels = CE.cluster_ensembles(partitions.astype(int))

        return ensemble_labels, scores
Example No. 8
def get_cc_labels(N_samples, N_clusters, N_iterations, N_comparisons):

    possible_cluster_labels = get_partition_space(N_samples, N_clusters)

    cc_labels = np.empty((N_comparisons, N_samples), dtype = int)
    for i in range(N_comparisons):
        cluster_runs = get_cluster_runs(N_samples, N_iterations, possible_cluster_labels)
  
        with NamedTemporaryFile('w', suffix = '.h5', delete = True, dir = './') as f:
            fileh = tables.open_file(f.name, 'w')
            fileh.create_group(fileh.root, 'consensus_group')
            fileh.close()

            cc_labels[i] = CE.cluster_ensembles(cluster_runs, f.name, 
                                          N_clusters_max = N_clusters)

    return cc_labels
Example No. 9
def all_ensemble(population, k):
    hdf5_file_name = './Cluster_Ensembles.h5'
    fileh = tables.open_file(hdf5_file_name, 'w')
    fileh.create_group(fileh.root, 'consensus_group')
    fileh.close()
    pop = []
    for i in range(len(population)):
        ind = []
        ind.extend(population[i])
        pop.append(ind)
    pop = np.array(pop)
    hypergraph_adjacency = build_hypergraph_adjacency(pop)
    store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
    consensus_clustering_labels = CE.MCLA(hdf5_file_name,
                                          pop,
                                          verbose=True,
                                          N_clusters_max=k + 2)
    return consensus_clustering_labels
Example No. 10
def get_cc_labels(N_samples, N_clusters, N_iterations, N_comparisons):

    possible_cluster_labels = get_partition_space(N_samples, N_clusters)

    cc_labels = np.empty((N_comparisons, N_samples), dtype=int)
    for i in range(N_comparisons):
        cluster_runs = get_cluster_runs(N_samples, N_iterations,
                                        possible_cluster_labels)

        with NamedTemporaryFile('w', suffix='.h5', delete=True, dir='./') as f:
            fileh = tables.open_file(f.name, 'w')
            fileh.create_group(fileh.root, 'consensus_group')
            fileh.close()

            cc_labels[i] = CE.cluster_ensembles(cluster_runs,
                                                f.name,
                                                N_clusters_max=N_clusters)

    return cc_labels
Example No. 11
def poisson_se_multiclust(data, k, n_runs=10, **se_params):
    """
    Initializes state estimation using a consensus of several
    fast clustering/dimensionality reduction algorithms.

    It takes a consensus of eight truncated-SVD + k-means rounds, and uses the
    basic nmf_init to create starting points.
    """
    clusters = []
    norm_data = cell_normalize(data)
    if sparse.issparse(data):
        log_data = data.log1p()
        log_norm = norm_data.log1p()
    else:
        log_data = np.log1p(data)
        log_norm = np.log1p(norm_data)
    tsvd_50 = TruncatedSVD(50)
    tsvd_k = TruncatedSVD(k)
    km = KMeans(k)
    tsvd1 = tsvd_50.fit_transform(data.T)
    tsvd2 = tsvd_k.fit_transform(data.T)
    tsvd3 = tsvd_50.fit_transform(log_data.T)
    tsvd4 = tsvd_k.fit_transform(log_data.T)
    tsvd5 = tsvd_50.fit_transform(norm_data.T)
    tsvd6 = tsvd_k.fit_transform(norm_data.T)
    tsvd7 = tsvd_50.fit_transform(log_norm.T)
    tsvd8 = tsvd_k.fit_transform(log_norm.T)
    tsvd_results = [tsvd1, tsvd2, tsvd3, tsvd4, tsvd5, tsvd6, tsvd7, tsvd8]
    clusters = []
    for t in tsvd_results:
        clust = km.fit_predict(t)
        clusters.append(clust)
    clusterings = np.vstack(clusters)
    consensus = CE.cluster_ensembles(clusterings,
                                     verbose=False,
                                     N_clusters_max=k)
    init_m, init_w = nmf_init(data, consensus, k, 'basic')
    M, W, ll = poisson_estimate_state(data,
                                      k,
                                      init_means=init_m,
                                      init_weights=init_w,
                                      **se_params)
    return M, W, ll
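
Per the docstring, the consensus is taken over eight TruncatedSVD + k-means labelings of the raw, log, normalized, and log-normalized data. A possible call, assuming poisson_se_multiclust and its helpers (cell_normalize, nmf_init, poisson_estimate_state, CE) are importable and that data is a genes x cells count matrix:

import numpy as np
from scipy import sparse

np.random.seed(0)
data = sparse.csr_matrix(np.random.poisson(1.5, size=(500, 120)))  # hypothetical counts

M, W, ll = poisson_se_multiclust(data, k=4)  # M: means, W: weights, ll: log-likelihood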
Example No. 12
def poisson_consensus_se(data, k, n_runs=10, **se_params):
    """
    Initializes Poisson State Estimation using a consensus Poisson clustering.
    """
    clusters = []
    for i in range(n_runs):
        assignments, means = poisson_cluster(data, k)
        clusters.append(assignments)
    clusterings = np.vstack(clusters)
    consensus = CE.cluster_ensembles(clusterings,
                                     verbose=False,
                                     N_clusters_max=k)
    init_m, init_w = nmf_init(data, consensus, k, 'basic')
    M, W, ll = poisson_estimate_state(data,
                                      k,
                                      init_means=init_m,
                                      init_weights=init_w,
                                      **se_params)
    return M, W, ll
Example No. 13
def poisson_se_tsne(data, k, n_runs=10, init='basic', **se_params):
    """
    runs tsne-consensus-poissonSE
    """
    clusters = []
    tsne = TSNE(2)
    km = KMeans(k)
    for i in range(n_runs):
        m, w, ll = poisson_estimate_state(data, k, **se_params)
        tsne_w = tsne.fit_transform(w.T)
        clust = km.fit_predict(tsne_w)
        clusters.append(clust)
    clusterings = np.vstack(clusters)
    consensus = CE.cluster_ensembles(clusterings,
                                     verbose=False,
                                     N_clusters_max=k)
    init_m, init_w = nmf_init(data, consensus, k, 'basic')
    M, W, ll = poisson_estimate_state(data,
                                      k,
                                      init_means=init_m,
                                      init_weights=init_w,
                                      **se_params)
    return M, W, ll
Example No. 14
def cooperative_cluster(data, feature_method, limit=0, nclusters=16):
    cluster_runs = []
    for d in data:
        classifier = pickle_load(d)
        clusterlist, n = extract_complete_clusterlist(classifier,
                                                      feature_method)
        clusterlist.sort(key=lambda c: c[0])
        images = [c[0] for c in clusterlist]
        labels = [c[1] for c in clusterlist]
        cluster_runs.append(labels)

    cluster_runs = numpy.asarray(cluster_runs)
    for i in range(cluster_runs.shape[0]):
        for j in range(cluster_runs.shape[1]):
            cluster_runs[i, j] = int(cluster_runs[i, j])
    # cluster_runs is an array of shape (M, N), where M is the number of clustering methods and N the number of samples
    if not limit == 0:
        cluster_runs = cluster_runs[:, :limit]
    consensus_labels = CE.cluster_ensembles(cluster_runs,
                                            verbose=True,
                                            N_clusters_max=nclusters)
    clusterlist = [(images[i], consensus_labels[i])
                   for i in range(len(consensus_labels))]
    return clusterlist
Example No. 15
def consistency_selection(nidpath,
                          pospath,
                          respath,
                          savepath,
                          mlset,
                          nlset,
                          distype='nid',
                          cutoff=0.9):
    """

    :param nidpath:
    :param pospath:
    :param respath:
    :param savepath:
    :param mlset:
    :param nlset:
    :param distype:
    :param cutoff:
    :return:
    """
    nidpath = os.path.expanduser(nidpath)
    for f in os.listdir(nidpath):
        if f.startswith('.'):
            continue
        fullpath = os.path.join(nidpath, f)
        if os.path.isfile(fullpath):
            fname = os.path.splitext(f)
            filename = fname[0].split('_' + distype)[0]
            dataset_name = filename.split('_')[0]
            if not os.path.isdir(savepath + dataset_name):
                os.mkdir(savepath + dataset_name)

            # read distance matrix, position matrix and label matrix from external file
            # note that the last 4 rows / cols are naive consensus & real labels
            distanceMatrix = np.loadtxt(fullpath, delimiter=',')
            pos = np.loadtxt(pospath + filename + '_mds2d.txt', delimiter=',')
            labels = np.loadtxt(respath + filename + '.res', delimiter=',')
            labels = labels.astype(int)

            # real labels store in the last row
            target = labels[-1]
            class_num = len(np.unique(target))

            # do mst clustering, we assume that there should be more than 5 solutions in each cluster
            mstmodel = MSTClustering(cutoff=cutoff,
                                     min_cluster_size=5,
                                     metric='precomputed')
            mstmodel.fit(distanceMatrix[0:-4, 0:-4])

            # compute average consistency of each cluster of solutions
            avg_cons = Metrics.average_consistency(mstmodel.labels_,
                                                   labels[0:-4], mlset, nlset)

            # find the cluster of solution with largest consistency
            maxclu = 0
            max_cons = 0.0
            print(avg_cons)
            for clu, cons in avg_cons.items():
                if clu == -1:
                    continue
                if cons > max_cons:
                    maxclu = clu
                    max_cons = cons

            # do consensus, note that last 4 rows should be skipped
            cluster_labels = labels[0:-4][mstmodel.labels_ == maxclu]
            labels_CSPA = ce.cluster_ensembles_CSPAONLY(
                cluster_labels, N_clusters_max=class_num)
            labels_HGPA = ce.cluster_ensembles_HGPAONLY(
                cluster_labels, N_clusters_max=class_num)
            labels_MCLA = ce.cluster_ensembles_MCLAONLY(
                cluster_labels, N_clusters_max=class_num)

            # print labels and diversities (between the real labels)
            nmi_CSPA = 1 - Metrics.diversityBtw2Cluster(labels_CSPA, target)
            nmi_HGPA = 1 - Metrics.diversityBtw2Cluster(labels_HGPA, target)
            nmi_MCLA = 1 - Metrics.diversityBtw2Cluster(labels_MCLA, target)
            print('consensus result diversity (CSPA) =' + str(nmi_CSPA))
            print('consensus diversity (HGPA) =' + str(nmi_HGPA))
            print('consensus diversity (MCLA) =' + str(nmi_MCLA))

            # store visualization file using 2d-MDS
            fig = plt.figure(1)
            plt.clf()
            clusters = np.unique(mstmodel.labels_)
            for i in clusters:
                xs = pos[0:-4][mstmodel.labels_ == i, 0]
                ys = pos[0:-4][mstmodel.labels_ == i, 1]
                ax = plt.axes([0., 0., 1., 1.])
                if i == -1:
                    plt.scatter(xs,
                                ys,
                                c=_colors[((int(i) + 1) % len(_colors))],
                                label='Outliers')
                elif i == maxclu:
                    plt.scatter(xs,
                                ys,
                                c=_colors[((int(i) + 1) % len(_colors))],
                                marker='*',
                                label='Selected')
                else:
                    plt.scatter(xs,
                                ys,
                                c=_colors[((int(i) + 1) % len(_colors))],
                                label='Clusters-' + str(i))

            plt.scatter(pos[-4:-1, 0],
                        pos[-4:-1, 1],
                        c='blue',
                        marker='D',
                        label='Consensus')
            plt.scatter(pos[-1:, 0],
                        pos[-1:, 1],
                        c='red',
                        marker='D',
                        label='Real')
            plt.legend(loc='best', shadow=True)
            plt.savefig(savepath + dataset_name + '/' + filename +
                        '_afterMST_selection_' + str(cutoff) + '.png',
                        format='png',
                        dpi=240)

    return
Example No. 16
def main():
    # init_population,init_ari,datamat,datalabels = ini_Cluster(kNumber=6)  # initial population produced by several clustering algorithms
    datamat, datalabels = loadDataset("../dataset/soybean-small.data")
    print 'data ready'

    pop_kmeans = initialMultiRun(datamat, 10, 'kmeans')
    print 'kmeans end'
    pop_ward = initialMultiRun(datamat, 10, 'ward')
    print 'ward end'
    pop_complete = initialMultiRun(datamat, 10, 'complete')
    print 'complete end'
    pop_average = initialMultiRun(datamat, 10, 'average')
    print 'average end'
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)

    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)

    filter_pop = filter(lambda x: len(x) > 0, init_population)  # drop results where initial clustering failed
    population = filter_pop  # overall population; offspring produced by crossover are added to it later

    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate,
                            tile(datamat, (len(invalid_ind), 1, 1)),
                            tile(population, (len(invalid_ind), 1, 1)),
                            invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    population = toolbox.select(population, len(population))

    for i in range(generation):
        print 'Generation %s' % i
        popElite = toolbox.select(population, int(round(
            len(population) * 0.5)))  #top half from population

        # Vary the population
        parentSpring = tools.selTournamentDCD(population, len(population))
        parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []
        # applying crossover
        for indiv1, indiv2 in zip(parentSpring[::2], parentSpring[1::2]):
            randNum = random.random()  # generate a random number from 0 to 1
            if randNum < 0.8:
                toolbox.mate(indiv1, indiv2)
                toolbox.mutate(indiv1)
                toolbox.mutate(indiv2)
                del indiv1.fitness.values, indiv2.fitness.values
                newoffspring.append(indiv1)
                newoffspring.append(indiv2)
            else:
                hdf5_file_name = './Cluster_Ensembles.h5'
                fileh = tables.open_file(hdf5_file_name, 'w')
                fileh.create_group(fileh.root, 'consensus_group')
                fileh.close()
                individuals = []
                individuals.append(indiv1)
                individuals.append(indiv2)
                individuals = np.array(individuals)
                hypergraph_adjacency = build_hypergraph_adjacency(individuals)
                store_hypergraph_adjacency(hypergraph_adjacency,
                                           hdf5_file_name)
                consensus_clustering_labels = CE.MCLA(hdf5_file_name,
                                                      individuals,
                                                      verbose=True,
                                                      N_clusters_max=10)
                ind_ensemble = creator.Individual(consensus_clustering_labels)
                newoffspring.append(ind_ensemble)

        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate,
                                tile(datamat, (len(invalid_ind), 1, 1)),
                                tile(newoffspring, (len(invalid_ind), 1, 1)),
                                invalid_ind)  # only the raw data is used here; the true labels are not used
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        # Choosing a population for the next generation
        population = toolbox.select(popElite + newoffspring, len(population))
    result1 = toolbox.nondominated(population, len(population))
    print len(result1)
    print result1
    print len(result1[0])
    print result1[0]
    print 'ARI values'
    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari
    print ari_arr
    print max_ari
    nmi_arr = []
    max_nmi = -inf
    print 'NMI values'
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print nmi_arr
    print max_nmi
Example No. 17
def autoGenerationWithConsensus(dataSets,
                                paramSettings,
                                verbose=True,
                                path='Results/',
                                checkDiversity=True,
                                metric='nid',
                                manifold_type='MDS',
                                subfolder=False):
    """
    generate ensemble members with consensus (CSPA, HGPA, MCLA) automatically

    Parameters
    ----------
    :param dataSets: a dictionary that keys are dataset names and values are corresponding load methods
    :param paramSettings: a nested dictionary that keys are dataset names and values are a dictionary containing params
    :param verbose: whether to output the debug information
    :param path: path to store the result matrix
    :param checkDiversity: whether to check the diversity
    :param metric: which distance metric to use; should be either 'diversity' or 'NID'.
    :param manifold_type: which manifold-transformation method is used for visualization; only 'MDS' is supported now.
    :param subfolder: whether to save the results into sub-folders named after the datasets (they should be created manually)

    Returns
    -------
    :return:
    """

    if not os.path.isdir(path):
        os.mkdir(path)

    for name, dataset in dataSets.iteritems():

        print 'start generating dataset:' + name

        if subfolder:
            savepath = path + name + '/'
            if not os.path.isdir(savepath):
                os.mkdir(savepath)
        else:
            savepath = path

        # get the dataset by load method
        data, target = dataset()

        # member and classnum must be defined in paramSettings
        n_members = paramSettings[name]['members']
        class_num = paramSettings[name]['classNum']

        # default values of s_Clusters, l_Clusters, FSR and SSR
        s_Clusters = class_num
        l_Clusters = class_num * 10
        FSR = 1
        SSR = 0.7
        FSR_l = 0.05
        SSR_l = 0.1
        sampling_method = 'FSRSNN'

        if 'method' in paramSettings[name]:
            sampling_method = paramSettings[name]['method']
            if sampling_method not in _sampling_methods.keys():
                raise ValueError(
                    'ensemble generation : Method should be either \'FSRSNN\' or \'FSRSNC\''
                )

        if 'constraints' in paramSettings[name]:
            constraints_file = paramSettings[name]['constraints']
            mlset, nlset = gcl.read_constraints(constraints_file)
        else:
            constraints_file = ''
            mlset = []
            nlset = []

        # get parameters from dictionary if available
        if 'FSR' in paramSettings[name]:
            FSR = paramSettings[name]['FSR']
        if 'SSR' in paramSettings[name]:
            SSR = paramSettings[name]['SSR']
        if 'FSR_L' in paramSettings[name]:
            FSR_l = paramSettings[name]['FSR_L']
        if 'SSR_L' in paramSettings[name]:
            SSR_l = paramSettings[name]['SSR_L']

        if 'small_Clusters' in paramSettings[
                name] and 'large_Clusters' in paramSettings[name]:
            s_Clusters = int(paramSettings[name]['small_Clusters'])
            l_Clusters = int(paramSettings[name]['large_Clusters'])

        f_stable_sample = True
        s_stable_sample = True
        if 'F_STABLE' in paramSettings[name]:
            f_stable_sample = paramSettings[name]['F_STABLE']
        if 'S_STABLE' in paramSettings[name]:
            s_stable_sample = paramSettings[name]['S_STABLE']

        if FSR_l > FSR:
            FSR_l = FSR / 2
        if SSR_l > SSR:
            SSR_l = SSR / 2

        # there should be at least 2 clusters in the clustering
        if s_Clusters < 2:
            s_Clusters = 2
        if l_Clusters < s_Clusters:
            l_Clusters = s_Clusters

        tag = True

        # matrix to store clustering results
        mat = np.empty(data.shape[0])

        # generate ensemble members
        for i in range(0, n_members):
            # determine k randomly
            cluster_num = np.random.randint(s_Clusters, l_Clusters + 1)
            random_state = np.random.randint(0, 2147483647 - 1)

            cur_FSR = FSR
            cur_SSR = SSR
            if not f_stable_sample:
                cur_FSR = rand.uniform(FSR_l, FSR)
            if not s_stable_sample:
                cur_SSR = rand.uniform(SSR_l, SSR)

            # generate ensemble member by given method
            result = _sampling_methods[sampling_method](data,
                                                        target,
                                                        r_clusters=cluster_num,
                                                        r_state=random_state,
                                                        fsr=cur_FSR,
                                                        ssr=cur_SSR)
            # print diversity
            diver = Metrics.normalized_max_mutual_info_score(result, target)
            if verbose:
                print 'Member' + str(
                    i) + ' diversity between real labels = ' + str(diver)
            # stack the result into the matrix
            if tag:
                mat = np.array(result)
                mat = np.reshape(mat, (1, data.shape[0]))
                tag = False
            else:
                temp = np.array(result)
                temp = np.reshape(temp, (1, data.shape[0]))
                mat = np.vstack([mat, np.array(temp)])

        # change element type to int for consensus
        mat = mat.astype(int)

        clf = cluster.KMeans(n_clusters=class_num)
        clf.fit(data)
        kmlabels = clf.labels_

        # do consensus
        labels_CSPA = ce.cluster_ensembles_CSPAONLY(mat,
                                                    N_clusters_max=class_num)
        labels_HGPA = ce.cluster_ensembles_HGPAONLY(mat,
                                                    N_clusters_max=class_num)
        labels_MCLA = ce.cluster_ensembles_MCLAONLY(mat,
                                                    N_clusters_max=class_num)

        if verbose:
            print 'Consensus results:'
            print labels_CSPA
            print labels_HGPA
            print labels_MCLA

        # put consensus results into the matrix
        mat = np.vstack([mat, np.reshape(kmlabels, (1, data.shape[0]))])
        mat = np.vstack([mat, np.reshape(labels_CSPA, (1, data.shape[0]))])
        mat = np.vstack([mat, np.reshape(labels_HGPA, (1, data.shape[0]))])
        mat = np.vstack([mat, np.reshape(labels_MCLA, (1, data.shape[0]))])

        # put real labels into the matrix
        temp = np.reshape(target, (1, data.shape[0]))
        mat = np.vstack([mat, np.array(temp)])

        # path and filename to write the file
        filename = _get_file_name(name, s_Clusters, l_Clusters, FSR, FSR_l,
                                  SSR, SSR_l, n_members, f_stable_sample,
                                  s_stable_sample, sampling_method)
        print 'Dataset ' + name + ', consensus finished, results are saving to file : ' + filename

        # write results to external file, use %d to keep integer part only
        np.savetxt(savepath + filename + '.res', mat, fmt='%d', delimiter=',')

        if checkDiversity:

            # print labels and diversities (between the real labels)
            nmi_CSPA = Metrics.normalized_max_mutual_info_score(
                labels_CSPA, target)
            nmi_HGPA = Metrics.normalized_max_mutual_info_score(
                labels_HGPA, target)
            nmi_MCLA = Metrics.normalized_max_mutual_info_score(
                labels_MCLA, target)
            print 'consensus result diversity (CSPA) =' + str(nmi_CSPA)
            print 'consensus diversity (HGPA) =' + str(nmi_HGPA)
            print 'consensus diversity (MCLA) =' + str(nmi_MCLA)

            kmnmi = Metrics.normalized_max_mutual_info_score(kmlabels, target)
            print 'single-model diversity (K-means) =' + str(kmnmi)
            if metric == 'diversity':
                distance_matrix = Metrics.diversityMatrix(mat)
                np.savetxt(savepath + filename + '_diversity.txt',
                           distance_matrix,
                           delimiter=',')
            else:
                distance_matrix = Metrics.NIDMatrix(mat)
                np.savetxt(savepath + filename + '_nid.txt',
                           distance_matrix,
                           delimiter=',')

            # save performances
            perf = np.array([nmi_CSPA, nmi_HGPA, nmi_MCLA, kmnmi])
            np.savetxt(savepath + filename + '_performance.txt',
                       perf,
                       fmt='%.6f',
                       delimiter=',')

        if manifold_type == 'MDS':
            # transform distance matrix into 2-d or 3-d coordinates to visualize
            mds2d = manifold.MDS(n_components=2,
                                 max_iter=10000,
                                 eps=1e-12,
                                 dissimilarity='precomputed')
            mds3d = manifold.MDS(n_components=3,
                                 max_iter=10000,
                                 eps=1e-12,
                                 dissimilarity='precomputed')
            pos2d = mds2d.fit(distance_matrix).embedding_
            pos3d = mds3d.fit(distance_matrix).embedding_
            np.savetxt(savepath + filename + '_mds2d.txt',
                       pos2d,
                       fmt="%.6f",
                       delimiter=',')
            np.savetxt(savepath + filename + '_mds3d.txt',
                       pos3d,
                       fmt="%.6f",
                       delimiter=',')

            cv.draw_ordered_distance_matrix(
                distance_matrix,
                savepath + filename + '_original_distance.png',
                savepath + filename + '_odm.png')
            cv.plot_k_distribution(mat, pos2d,
                                   savepath + filename + '_k_distribution.png')
            if constraints_file != '':
                cv.plot_consistency(mat,
                                    pos2d,
                                    mlset,
                                    nlset,
                                    savepath + filename +
                                    '_consistency_both.png',
                                    consistency_type='both')
                cv.plot_consistency(mat,
                                    pos2d,
                                    mlset,
                                    nlset,
                                    savepath + filename +
                                    '_consistency_must.png',
                                    consistency_type='must')
                cv.plot_consistency(mat,
                                    pos2d,
                                    mlset,
                                    nlset,
                                    savepath + filename +
                                    '_consistency_cannot.png',
                                    consistency_type='cannot')
    return
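
The docstring describes dataSets as a name-to-loader mapping and paramSettings as a nested dict that must contain at least 'members' and 'classNum' per dataset. A hypothetical setup follows; the loader and parameter values are illustrative, and the call assumes a Python 2 interpreter plus the module-level helpers this function relies on (_sampling_methods, Metrics, ce, cv, gcl, manifold) being importable:

from sklearn import datasets

def load_iris_data():
    # a loader must return (data, target)
    iris = datasets.load_iris()
    return iris.data, iris.target

dataSets = {'iris': load_iris_data}
paramSettings = {
    'iris': {
        'members': 20,       # required: number of ensemble members
        'classNum': 3,       # required: number of classes
        'method': 'FSRSNN',  # optional: sampling method ('FSRSNN' or 'FSRSNC')
        'FSR': 0.7,          # optional: feature sampling rate
        'SSR': 0.5,          # optional: sample sampling rate
    }
}

autoGenerationWithConsensus(dataSets, paramSettings, path='Results/', verbose=True)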
Example No. 18
def all_cluster_consensus(nidpath, respath, distype='nid', cutoff=0.9):
    """

    :param nidpath:
    :param respath:
    :param distype:
    :param cutoff:
    :return:
    """
    nidpath = os.path.expanduser(nidpath)
    for f in os.listdir(nidpath):
        if f.startswith('.'):
            continue
        fullpath = os.path.join(nidpath, f)
        if os.path.isfile(fullpath):
            fname = os.path.splitext(f)
            filename = fname[0].split('_' + distype)[0]
            dataset_name = filename.split('_')[0]

            # read distance matrix, position matrix and label matrix from external file
            # note that the last 4 rows / cols are naive consensus & real labels
            print(fullpath)
            distanceMatrix = np.loadtxt(fullpath, delimiter=',')
            labels = np.loadtxt(respath + filename + '.res', delimiter=',')
            labels = labels.astype(int)

            # real labels store in the last row
            target = labels[-1]
            class_num = len(np.unique(target))

            # do mst clustering, we assume that there should be more than 5 solutions in each cluster
            mstmodel = MSTClustering(cutoff=cutoff,
                                     min_cluster_size=5,
                                     metric='precomputed')
            mstmodel.fit(distanceMatrix[0:-4, 0:-4])

            clusters = np.unique(mstmodel.labels_)
            for i in clusters:
                # do consensus, note that last 4 rows should be skipped
                cluster_labels = labels[0:-4][mstmodel.labels_ == i]
                labels_CSPA = ce.cluster_ensembles_CSPAONLY(
                    cluster_labels, N_clusters_max=class_num)
                labels_HGPA = ce.cluster_ensembles_HGPAONLY(
                    cluster_labels, N_clusters_max=class_num)
                labels_MCLA = ce.cluster_ensembles_MCLAONLY(
                    cluster_labels, N_clusters_max=class_num)

                # print labels and diversities (between the real labels)
                nmi_CSPA = 1 - Metrics.diversityBtw2Cluster(
                    labels_CSPA, target)
                nmi_HGPA = 1 - Metrics.diversityBtw2Cluster(
                    labels_HGPA, target)
                nmi_MCLA = 1 - Metrics.diversityBtw2Cluster(
                    labels_MCLA, target)
                print('Cluster ' + str(i) +
                      '===========================================')
                print('consensus result diversity (CSPA) =' + str(nmi_CSPA))
                print('consensus diversity (HGPA) =' + str(nmi_HGPA))
                print('consensus diversity (MCLA) =' + str(nmi_MCLA))

    return
Example No. 19
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20, node_color = (colors[count]))
    count += 1
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.axis("off")
plt.show()

clustering_1 = [0,1,1,2,0,2,1,0,2,1]
clustering_2 = [0,1,2,0,0,2,2,1,2,1]
clustering_3 = [2,0,0,2,1,1,1,0,1,2]
cluster_runs = np.array([clustering_1, clustering_2, clustering_3])
print("cluster_runs", cluster_runs)




consensus_clustering_labels = CE.cluster_ensembles(cluster_runs, verbose = True, N_clusters_max = 3) 
print("consensus_clustering_labels", consensus_clustering_labels)

import pandas as pd

# reading csv files
article = pd.read_csv("article.csv", sep="\t")
auteur = pd.read_csv("auteur.csv", sep=";")

article.describe()

article.groupby('year').agg('count')

# Remove rows that have no abstract
article['abstract'].replace('',np.nan, inplace=True)
article.dropna(subset=['abstract'], inplace = True)
Example No. 20
    fig, ax = plt.subplots(n, n, figsize=(10, 10), sharex=True, sharey=True)
    for i in range(n):
        for j in range(n):
            if i + n * j >= len(segmentations):
                break
            segment = segmentations[i + n * j]
            ax[j, i].imshow(mark_boundaries(image, segment))

    for a in ax.ravel():
        a.set_axis_off()

    plt.tight_layout()
    plt.show()


print("Calculating segmentations")
segmentations = calculate_segmentations(img)

print("Combining clusterings")
cluster_runs = np.asarray(list(map(lambda s: s.flatten(), segmentations)))
consensus = ce.cluster_ensembles(cluster_runs, verbose=True, N_clusters_max=30)
consensus = consensus.reshape(segmentations[0].shape)

print("Drawing results")
draw_segmentations(segmentations, img)

plt.figure(dpi=300)
plt.imshow(mark_boundaries(img, consensus))
plt.axis('off')
plt.show()
Example No. 21
def consensus_clustering():
    cluster_run = np.array([c_1, c_2, c_3])  # c_1, c_2, c_3: label arrays from three clustering runs, defined elsewhere
    consensus_labels = CE.cluster_ensembles(cluster_run, N_clusters_max=3)
Example No. 22
def moclenew(datamat):
    # datamat,datalabels = loadDataset("../dataset/glass.data")
    print 'data ready'
    pop_kmeans = ini_population(datamat, 'kmeans', 10)
    print 'kmeans end'
    pop_ward = ini_population(datamat, 'ward', 10)
    print 'ward end'
    pop_complete = ini_population(datamat, 'complete', 10)
    print 'complete end'
    pop_average = ini_population(datamat, 'average', 10)
    print 'average end'
    # pop_spc = ini_population(datamat, 'spc', 1)
    # print 'spc end'
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_complete)
    pop.extend(pop_average)
    # pop.extend(pop_spc)
    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)

    filter_pop = filter(lambda x: len(x) > 0, init_population)  # drop results where initial clustering failed
    population = filter_pop  # overall population; offspring produced by crossover are added to it later

    # distance matrix between all data points, used by the second objective function (only half needs to be computed)
    # dataLen = len(datamat)
    # distances_matrix = zeros((dataLen, dataLen))
    # for datai in range(dataLen):
    #     for dataj in range(datai+1,dataLen):
    #         distances_matrix[datai][dataj] = Euclidean_dist(datamat[datai],datamat[dataj])
    distances_matrix = pairwise_distances(datamat,
                                          metric='euclidean')  # pairwise distances between all data points
    print "Distance matrix between data points computed"
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    for ind in invalid_ind:
        euDistance, eu_connect = mocle_index(datamat, distances_matrix, ind)
        fitnesses = (euDistance, eu_connect)
        ind.fitness.values = fitnesses
    # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(distances_matrix,(len(invalid_ind),1,1)),invalid_ind)
    #
    # for ind, fit in zip(invalid_ind, fitnesses):
    #     ind.fitness.values = fit

    # population = toolbox.select(population, len(population))
    popeliteLen = len(population)
    for i in range(generation):
        print 'Generation %s' % i
        popElite = toolbox.select(population, popeliteLen)
        # Vary the population
        # parentSpring = tools.selTournamentDCD(popElite, popeliteLen)
        # parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []
        # applying crossover
        popcrossover = toolbox.select(population, 2)

        k1 = len(list(set(popcrossover[0])))
        k2 = len(list(set(popcrossover[1])))
        if k1 <= k2:
            k = random.randint(k1, k2 + 1)
        else:
            k = random.randint(k2, k1 + 1)
        # other cluster-ensemble operators
        hdf5_file_name = './Cluster_Ensembles.h5'
        fileh = tables.open_file(hdf5_file_name, 'w')
        fileh.create_group(fileh.root, 'consensus_group')
        fileh.close()
        popcrossover = np.array(popcrossover)
        hypergraph_adjacency = build_hypergraph_adjacency(popcrossover)
        store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
        resultList = CE.MCLA(hdf5_file_name,
                             popcrossover,
                             verbose=True,
                             N_clusters_max=k)
        ind_ensemble = creator.Individual(resultList)
        newoffspring.append(ind_ensemble)

        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        for ind1 in invalid_ind:
            euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix,
                                                   ind1)
            fitnesses1 = (euDistance1, eu_connect1)
            ind1.fitness.values = fitnesses1

        # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(distances_matrix,(len(invalid_ind),1,1)),invalid_ind)#这里只用了未经处理的数据,没有用到真实类别
        #
        # for ind, fit in zip(invalid_ind, fitnesses):
        #     ind.fitness.values = fit

        # Choosing a population for the next generation
        # population = toolbox.select(popElite + newoffspring, popeliteLen)
        population = popElite + newoffspring
    result1 = toolbox.nondominated(population, len(population))
    nondominated_result = result1[0]
    final_result, pbmValue = computePBM(datamat, nondominated_result)
    return final_result, pbmValue
Example No. 23
def run_experiment(methods,
                   data,
                   n_classes,
                   true_labels,
                   n_runs=10,
                   use_purity=True,
                   use_nmi=False,
                   use_ari=False,
                   use_nne=False,
                   consensus=False):
    """
    runs a pre-processing + clustering experiment...

    exactly one of use_purity, use_nmi, use_ari, or use_nne can be true

    Args:
        methods: list of 2-tuples. The first element is either a single Preprocess object or a list of Preprocess objects, to be applied in sequence to the data. The second element is either a single Cluster object, a list of Cluster objects, or a list of lists, where each list is a sequence of Preprocess objects with the final element being a Cluster object.
        data: genes x cells array
        true_labels: 1d array of length cells
        consensus: if true, runs a consensus on cluster results for each method at the very end.
        use_purity, use_nmi, use_ari, use_nne: which error metric to use (at most one can be True)

    Returns:
        purities (list of lists)
        names (list of lists)
        other (dict): keys: timing, preprocessing, clusterings
    """
    results = []
    names = []
    clusterings = {}
    other_results = {}
    other_results['timing'] = {}
    other_results['preprocessing'] = {}
    if use_purity:
        purity_method = purity
    elif use_nmi:
        purity_method = nmi
    elif use_ari:
        purity_method = ari
    elif use_nne:
        purity_method = nne
    for i in range(n_runs):
        print('run {0}'.format(i))
        purities = []
        r = 0
        method_index = 0
        for preproc, cluster in methods:
            t0 = time.time()
            if isinstance(preproc, Preprocess):
                preprocessed, ll = preproc.run(data)
                output_names = preproc.output_names
            else:
                # if the input is a list, only use the first preproc result
                p1 = data
                output_names = ['']
                for p in preproc:
                    p1, ll = p.run(p1)
                    p1 = p1[0]
                    if output_names[0] != '':
                        output_names[
                            0] = output_names[0] + '_' + p.output_names[0]
                    else:
                        output_names[0] = p.output_names[0]
                preprocessed = [p1]
            t1 = time.time() - t0
            for name, pre in zip(output_names, preprocessed):
                starting_index = method_index
                if isinstance(cluster, Cluster):
                    #try:
                    t0 = time.time()
                    labels = cluster.run(pre)
                    t2 = t1 + time.time() - t0
                    if use_nne:
                        purities.append(purity_method(pre, true_labels))
                    else:
                        purities.append(purity_method(labels, true_labels))
                    if i == 0:
                        names.append(name + '_' + cluster.name)
                        clusterings[names[-1]] = []
                        other_results['timing'][names[-1]] = []
                    print(names[r])
                    clusterings[names[r]].append(labels)
                    print('time: ' + str(t2))
                    other_results['timing'][names[r]].append(t2)
                    print(purities[-1])
                    r += 1
                    method_index += 1
                #except:
                #    print('failed to do clustering')
                elif type(cluster) == list:
                    for c in cluster:
                        if isinstance(c, list):
                            t2 = t1
                            name2 = name
                            sub_data = pre.copy()
                            for subproc in c[:-1]:
                                t0 = time.time()
                                subproc_out, ll = subproc.run(sub_data)
                                sub_data = subproc_out[0]
                                name2 = name2 + '_' + subproc.output_names[0]
                                t2 += time.time() - t0
                            t0 = time.time()
                            labels = c[-1].run(sub_data)
                            t2 += time.time() - t0
                            if use_nne:
                                purities.append(
                                    purity_method(sub_data, true_labels))
                            else:
                                purities.append(
                                    purity_method(labels, true_labels))
                            if i == 0:
                                names.append(name2 + '_' + c[-1].name)
                                clusterings[names[-1]] = []
                                other_results['timing'][names[-1]] = []
                            print(names[r])
                            clusterings[names[r]].append(labels)
                            other_results['timing'][names[r]].append(t2)
                            print('time: ' + str(t2))
                            print(purities[-1])
                            r += 1
                            method_index += 1
                        else:
                            try:
                                t0 = time.time()
                                labels = c.run(pre)
                                t2 = t1 + time.time() - t0
                                if i == 0:
                                    names.append(name + '_' + c.name)
                                    clusterings[names[-1]] = []
                                    other_results['timing'][names[-1]] = []
                                if use_nne:
                                    purities.append(
                                        purity_method(pre, true_labels))
                                else:
                                    purities.append(
                                        purity_method(labels, true_labels))
                                print(names[r])
                                clusterings[names[r]].append(labels)
                                other_results['timing'][names[r]].append(t2)
                                print('time: ' + str(t2))
                                print(purities[-1])
                                r += 1
                                method_index += 1
                            except:
                                print('failed to do clustering')
                # find the highest purity for the pre-processing method
                # save the preprocessing result with the highest NMI
                num_clustering_results = method_index - starting_index
                clustering_results = purities[-num_clustering_results:]
                if i > 0 and len(clustering_results) > 0:
                    old_clustering_results = results[-1][
                        starting_index:method_index]
                    if max(old_clustering_results) < max(clustering_results):
                        other_results['preprocessing'][name] = pre
                else:
                    other_results['preprocessing'][name] = pre
        print('\t'.join(names))
        print('purities: ' + '\t'.join(map(str, purities)))
        results.append(purities)
    consensus_purities = []
    if consensus:
        other_results['consensus'] = {}
        k = len(np.unique(true_labels))
        for name, clusts in clusterings.items():
            print(name)
            clusts = np.vstack(clusts)
            consensus_clust = CE.cluster_ensembles(clusts,
                                                   verbose=False,
                                                   N_clusters_max=k)
            other_results['consensus'][name] = consensus_clust
            if use_purity:
                consensus_purity = purity(consensus_clust.flatten(),
                                          true_labels)
                print('consensus purity: ' + str(consensus_purity))
                consensus_purities.append(consensus_purity)
            if use_nmi:
                consensus_nmi = nmi(true_labels, consensus_clust)
                print('consensus NMI: ' + str(consensus_nmi))
                consensus_purities.append(consensus_nmi)
            if use_ari:
                consensus_ari = ari(true_labels, consensus_clust)
                print('consensus ARI: ' + str(consensus_ari))
                consensus_purities.append(consensus_ari)
        print('consensus results: ' + '\t'.join(map(str, consensus_purities)))
    other_results['clusterings'] = clusterings
    return results, names, other_results
Example No. 24
def multirun(datasetName):
    # datamat,datalabels = loadDataset("../dataset/glass.data")
    path = '../dataset/' + datasetName
    datamat, datalabels = loadDataset(path)
    print 'data ready'
    sampledData, remainedData, sampledIndex, remainedIndex = data_sample(
        datamat, 1, 2)
    print 'sampledData ready'

    pop_kmeans = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                      'kmeans')
    print 'kmeans end'
    max_nmi1 = -inf
    for ind1 in pop_kmeans:
        nmi1 = normalized_mutual_info_score(datalabels, ind1)
        if nmi1 > max_nmi1:
            max_nmi1 = nmi1
    print 'Initial kmeans max NMI is %s' % max_nmi1
    pop_ward = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                    'ward')
    print 'ward end'
    max_nmi2 = -inf
    for ind2 in pop_ward:
        nmi2 = normalized_mutual_info_score(datalabels, ind2)
        if nmi2 > max_nmi2:
            max_nmi2 = nmi2
    print 'Initial ward max NMI is %s' % max_nmi2
    pop_complete = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                        'complete')
    print 'complete end'
    max_nmi3 = -inf
    for ind3 in pop_complete:
        nmi3 = normalized_mutual_info_score(datalabels, ind3)
        if nmi3 > max_nmi3:
            max_nmi3 = nmi3
    print 'Initial complete max NMI is %s' % max_nmi3
    pop_average = rsnn(sampledData, remainedData, sampledIndex, remainedIndex,
                       'average')
    print 'average end'
    max_nmi4 = -inf
    for ind4 in pop_average:
        nmi4 = normalized_mutual_info_score(datalabels, ind4)
        if nmi4 > max_nmi4:
            max_nmi4 = nmi4
    print 'Initial average max NMI is %s' % max_nmi4
    pop = []
    pop.extend(pop_kmeans)
    pop.extend(pop_ward)
    pop.extend(pop_complete)
    pop.extend(pop_average)

    init_population = []
    for indiv1 in pop:
        ind1 = creator.Individual(indiv1)
        init_population.append(ind1)

    filter_pop = filter(lambda x: len(x) > 0, init_population)  # drop results where initial clustering failed
    population = filter_pop  # overall population; offspring produced by crossover are added to it later

    # distance matrix between all data points, used by the second objective function (only half needs to be computed)
    # dataLen = len(datamat)
    # eudataPointMatrix = zeros((dataLen, dataLen))
    # for datai in range(dataLen):
    #     for dataj in range(datai+1,dataLen):
    #         eudataPointMatrix[datai][dataj] = Euclidean_dist(datamat[datai],datamat[dataj])
    distances_matrix = pairwise_distances(datamat,
                                          metric='euclidean')  # pairwise distances between all data points
    print "Distance matrix between data points computed"
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(distances_matrix,(len(invalid_ind),1,1)),invalid_ind)
    # for ind, fit in zip(invalid_ind, fitnesses):
    #     ind.fitness.values = fit
    for ind1 in invalid_ind:
        euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix, ind1)
        fitnesses1 = (euDistance1, eu_connect1)
        ind1.fitness.values = fitnesses1
    # population = toolbox.select(population, len(population))
    popeliteLen = len(population)
    for i in range(generation):
        print 'Generation %s' % i
        popElite = toolbox.select(population, popeliteLen)
        # Vary the population
        # parentSpring = tools.selTournamentDCD(popElite, popeliteLen)
        # parentSpring = [toolbox.clone(ind) for ind in parentSpring]
        newoffspring = []
        # applying crossover

        subpopArr = getSubPop(popElite)
        count = 0  # counts how many new individuals are added
        for subpop in subpopArr:
            # DSCE as the crossover operator
            # a1=0.6
            # a2=0.5
            # transMatrix, popClusterArr_3, popClusterArr_2, clusterNumArr = transformation(datamat, subpop)
            # similiarMatrix = measureSimilarity(transMatrix, popClusterArr_3, popClusterArr_2,
            #                                                   clusterNumArr, datamat, a1=a1)
            # dictCownP = assign(similiarMatrix, a2)
            # resultList = resultTransform(dictCownP, datamat)
            # other cluster-ensemble operators
            hdf5_file_name = './Cluster_Ensembles.h5'
            fileh = tables.open_file(hdf5_file_name, 'w')
            fileh.create_group(fileh.root, 'consensus_group')
            fileh.close()
            subpop = np.array(subpop)
            hypergraph_adjacency = build_hypergraph_adjacency(subpop)
            store_hypergraph_adjacency(hypergraph_adjacency, hdf5_file_name)
            resultList = CE.CSPA(hdf5_file_name,
                                 subpop,
                                 verbose=True,
                                 N_clusters_max=3)
            resultList = list(resultList)

            clu = list(set(resultList))
            clulen = len(clu)
            actual_resultList = []

            if clulen > 1:
                ind_ensemble = creator.Individual(resultList)
                newoffspring.append(ind_ensemble)
                actual_resultList = resultList  # an offspring is kept only when it has more than one cluster
                count += 1
            if actual_resultList:
                predicted_clusternum = len(set(actual_resultList))
                ind_new = KMeans(
                    n_clusters=predicted_clusternum).fit_predict(datamat)
                ind_new_tran = creator.Individual(ind_new)
                newoffspring.append(ind_new_tran)
                count += 1
        print "这一代增加里%s个个体" % count
        # evaluating fitness of individuals with invalid fitnesses
        invalid_ind = [ind for ind in newoffspring if not ind.fitness.valid]
        # fitnesses = toolbox.map(toolbox.evaluate, tile(datamat,(len(invalid_ind),1,1)),tile(distances_matrix,(len(invalid_ind),1,1)),invalid_ind)#这里只用了未经处理的数据,没有用到真实类别
        # for ind, fit in zip(invalid_ind, fitnesses):
        #     ind.fitness.values = fit

        for ind1 in invalid_ind:
            euDistance1, eu_connect1 = mocle_index(datamat, distances_matrix,
                                                   ind1)
            fitnesses1 = (euDistance1, eu_connect1)
            ind1.fitness.values = fitnesses1
        # Choosing a population for the next generation
        # population = toolbox.select(popElite + newoffspring, popeliteLen)
        population = popElite + newoffspring
    result1 = toolbox.nondominated(population, len(population))
    ari_arr = []
    max_ari = -inf
    for ind in result1[0]:
        ari = adjusted_rand_score(datalabels, ind)
        ari_arr.append(ari)
        if ari > max_ari:
            max_ari = ari
    nmi_arr = []
    max_nmi = -inf
    print 'NMI values'
    for ind in result1[0]:
        nmi = normalized_mutual_info_score(datalabels, ind)
        nmi_arr.append(nmi)
        if nmi > max_nmi:
            max_nmi = nmi
    print 'Max NMI is: %s' % max_nmi
    print nmi_arr
    return max_nmi, max_ari
Example No. 25
def ensembleClustering(nbClusterMax, labels1, labels2):
    cluster_runs = np.vstack((labels1, labels2))
    ceResult = CE.cluster_ensembles(cluster_runs, verbose=False, N_clusters_max=nbClusterMax)
    return ceResult
Example No. 26
def get_consensus_labels(cluster_runs, consensus_clustering_max_k, verbose=False):
    consensus_clustering_labels = CE.cluster_ensembles(cluster_runs, verbose=verbose,
                                                       N_clusters_max=consensus_clustering_max_k)
    return consensus_clustering_labels
Example No. 27
def test_cluster_ensemble():
    cluster_runs = np.random.randint(0, 50, (50, 15000))
    consensus_clustering_labels = CE.cluster_ensembles(cluster_runs,
                                                       verbose=True,
                                                       N_clusters_max=50)