Python MSTClustering примеры использования

Язык программирования: Python

Пространство имен/Пакет: MSTClustering

Класс/Тип: MSTClustering

Примеров на hotexamples.com: 5

Python MSTClustering - 5 примеров найдено. Это лучшие примеры Python кода для MSTClustering.MSTClustering, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

MSTClustering(5)

Основные методы

MSTClustering (5)

Пример #1

Показать файл

Файл: test_mst.py Проект: zhouce7/cluster_ensemble

def mst_with_cutoff(distance_matrix,
                    pos,
                    savepath,
                    cutoff_type='rate',
                    cutoff_upper=0.9,
                    cutoff_lower=0.1,
                    interval=0.05):
    """
    Do several MST clustering on given library of solutions in a range of different 'cutoff' parameters
    Used for deciding the suitable cutoff value which will be utilized in the MST selection procedure

    Parameters
    ----------
    :param distance_matrix: distance matrix of given library of solutions
    :param pos: positions generated by MDS or other embedding approaches, used for visualization
    :param savepath: path where the visualization of clustering result stored
    :param cutoff_type: type of parameter 'cutoff', should either be :
                       'threshold' :  edges with larger weight than threshold will be cut
                       'rate' :       number(>=1) / fraction(<1.0) of edges that will be cut
    :param cutoff_upper: upper bound of parameter cutoff
    :param cutoff_lower: lower bound of parameter cutoff
    :param interval: interval of parameter cutoff

    """
    # do MST clustering by setting 'cutoff' between [cutoff_lower, cutoff_upper] with given interval
    cur_cutoff = cutoff_lower
    while cur_cutoff <= cutoff_upper:
        if cutoff_type == 'threshold':
            mstmodel = MSTClustering(cutoff_scale=cur_cutoff,
                                     min_cluster_size=2,
                                     metric='precomputed',
                                     approximate=False)
        else:
            mstmodel = MSTClustering(cutoff=cur_cutoff,
                                     min_cluster_size=2,
                                     metric='precomputed',
                                     approximate=False)
        mstmodel.fit(distance_matrix[0:-5, 0:-5])
        filename = savepath + str(cur_cutoff) + '.png'
        cv.plot_mst_result(mstmodel, pos, filename)
        cur_cutoff += interval
    return

Пример #2

Показать файл

def consistency_selection(nidpath,
                          pospath,
                          respath,
                          savepath,
                          mlset,
                          nlset,
                          distype='nid',
                          cutoff=0.9):
    """

    :param nidpath:
    :param pospath:
    :param respath:
    :param savepath:
    :param mlset:
    :param nlset:
    :param distype:
    :param cutoff:
    :return:
    """
    nidpath = os.path.expanduser(nidpath)
    for f in os.listdir(nidpath):
        if f.startswith('.'):
            continue
        fullpath = os.path.join(nidpath, f)
        if os.path.isfile(fullpath):
            fname = os.path.splitext(f)
            filename = fname[0].split('_' + distype)[0]
            dataset_name = filename.split('_')[0]
            if not os.path.isdir(savepath + dataset_name):
                os.mkdir(savepath + dataset_name)

            # read distance matrix, position matrix and label matrix from external file
            # note that the last 4 rows / cols are naive consensus & real labels
            distanceMatrix = np.loadtxt(fullpath, delimiter=',')
            pos = np.loadtxt(pospath + filename + '_mds2d.txt', delimiter=',')
            labels = np.loadtxt(respath + filename + '.res', delimiter=',')
            labels = labels.astype(int)

            # real labels store in the last row
            target = labels[-1]
            class_num = len(np.unique(target))

            # do mst clustering, we assume that there should be more than 5 solutions in each cluster
            mstmodel = MSTClustering(cutoff=cutoff,
                                     min_cluster_size=5,
                                     metric='precomputed')
            mstmodel.fit(distanceMatrix[0:-4, 0:-4])

            # compute average consistency of each cluster of solutions
            avg_cons = Metrics.average_consistency(mstmodel.labels_,
                                                   labels[0:-4], mlset, nlset)

            # find the cluster of solution with largest consistency
            maxclu = 0
            max_cons = 0.0
            print(avg_cons)
            for clu, cons in avg_cons.iteritems():
                if clu == -1:
                    continue
                if cons > max_cons:
                    maxclu = clu
                    max_cons = cons

            # do consensus, note that last 4 rows should be skipped
            cluster_labels = labels[0:-4][mstmodel.labels_ == maxclu]
            labels_CSPA = ce.cluster_ensembles_CSPAONLY(
                cluster_labels, N_clusters_max=class_num)
            labels_HGPA = ce.cluster_ensembles_HGPAONLY(
                cluster_labels, N_clusters_max=class_num)
            labels_MCLA = ce.cluster_ensembles_MCLAONLY(
                cluster_labels, N_clusters_max=class_num)

            # print labels and diversities (between the real labels)
            nmi_CSPA = 1 - Metrics.diversityBtw2Cluster(labels_CSPA, target)
            nmi_HGPA = 1 - Metrics.diversityBtw2Cluster(labels_HGPA, target)
            nmi_MCLA = 1 - Metrics.diversityBtw2Cluster(labels_MCLA, target)
            print('consensus result diversity (CSPA) =' + str(nmi_CSPA))
            print('consensus diversity (HGPA) =' + str(nmi_HGPA))
            print('consensus diversity (MCLA) =' + str(nmi_MCLA))

            # store visualization file using 2d-MDS
            fig = plt.figure(1)
            plt.clf()
            clusters = np.unique(mstmodel.labels_)
            for i in clusters:
                xs = pos[0:-4][mstmodel.labels_ == i, 0]
                ys = pos[0:-4][mstmodel.labels_ == i, 1]
                ax = plt.axes([0., 0., 1., 1.])
                if i == -1:
                    plt.scatter(xs,
                                ys,
                                c=_colors[((int(i) + 1) % len(_colors))],
                                label='Outliers')
                elif i == maxclu:
                    plt.scatter(xs,
                                ys,
                                c=_colors[((int(i) + 1) % len(_colors))],
                                marker='*',
                                label='Selected')
                else:
                    plt.scatter(xs,
                                ys,
                                c=_colors[((int(i) + 1) % len(_colors))],
                                label='Clusters-' + str(i))

            plt.scatter(pos[-4:-1, 0],
                        pos[-4:-1, 1],
                        c='blue',
                        marker='D',
                        label='Consensus')
            plt.scatter(pos[-1:, 0],
                        pos[-1:, 1],
                        c='red',
                        marker='D',
                        label='Real')
            plt.legend(loc='best', shadow=True)
            plt.savefig(savepath + dataset_name + '/' + filename +
                        '_afterMST_selection_' + str(cutoff) + '.png',
                        format='png',
                        dpi=240)

    return

Пример #3

Показать файл

def all_cluster_consensus(nidpath, respath, distype='nid', cutoff=0.9):
    """

    :param nidpath:
    :param respath:
    :param distype:
    :param cutoff:
    :return:
    """
    nidpath = os.path.expanduser(nidpath)
    for f in os.listdir(nidpath):
        if f.startswith('.'):
            continue
        fullpath = os.path.join(nidpath, f)
        if os.path.isfile(fullpath):
            fname = os.path.splitext(f)
            filename = fname[0].split('_' + distype)[0]
            dataset_name = filename.split('_')[0]

            # read distance matrix, position matrix and label matrix from external file
            # note that the last 4 rows / cols are naive consensus & real labels
            print(fullpath)
            distanceMatrix = np.loadtxt(fullpath, delimiter=',')
            labels = np.loadtxt(respath + filename + '.res', delimiter=',')
            labels = labels.astype(int)

            # real labels store in the last row
            target = labels[-1]
            class_num = len(np.unique(target))

            # do mst clustering, we assume that there should be more than 5 solutions in each cluster
            mstmodel = MSTClustering(cutoff=cutoff,
                                     min_cluster_size=5,
                                     metric='precomputed')
            mstmodel.fit(distanceMatrix[0:-4, 0:-4])

            clusters = np.unique(mstmodel.labels_)
            for i in clusters:
                # do consensus, note that last 4 rows should be skipped
                cluster_labels = labels[0:-4][mstmodel.labels_ == i]
                labels_CSPA = ce.cluster_ensembles_CSPAONLY(
                    cluster_labels, N_clusters_max=class_num)
                labels_HGPA = ce.cluster_ensembles_HGPAONLY(
                    cluster_labels, N_clusters_max=class_num)
                labels_MCLA = ce.cluster_ensembles_MCLAONLY(
                    cluster_labels, N_clusters_max=class_num)

                # print labels and diversities (between the real labels)
                nmi_CSPA = 1 - Metrics.diversityBtw2Cluster(
                    labels_CSPA, target)
                nmi_HGPA = 1 - Metrics.diversityBtw2Cluster(
                    labels_HGPA, target)
                nmi_MCLA = 1 - Metrics.diversityBtw2Cluster(
                    labels_MCLA, target)
                print('Cluster ' + str(i) +
                      '===========================================')
                print('consensus result diversity (CSPA) =' + str(nmi_CSPA))
                print('consensus diversity (HGPA) =' + str(nmi_HGPA))
                print('consensus diversity (MCLA) =' + str(nmi_MCLA))

    return

Пример #4

Показать файл

Файл: test_mst.py Проект: zhouce7/cluster_ensemble

def mst_with_cutoff(distance_matrix,
                    pos,
                    labels,
                    savepath,
                    logger,
                    mlset,
                    nlset,
                    cutoff_type='rate',
                    cutoff_upper=0.9,
                    cutoff_lower=0.1,
                    interval=0.05,
                    min_cluster_size=2,
                    top_k=5):
    """
    Do several MST clustering on given library of solutions in a range of different 'cutoff' parameters
    Used for deciding the suitable cutoff value which will be utilized in the MST selection procedure

    Parameters
    ----------
    :param distance_matrix: distance matrix of given library of solutions
    :param pos: positions generated by MDS or other embedding approaches, used for visualization
    :param savepath: path where the visualization of clustering result stored
    :param cutoff_type: type of parameter 'cutoff', should either be :
                       'threshold' :  edges with larger weight than threshold will be cut
                       'rate' :       number(>=1) / fraction(<1.0) of edges that will be cut
    :param cutoff_upper: upper bound of parameter cutoff
    :param cutoff_lower: lower bound of parameter cutoff
    :param interval: interval of parameter cutoff
    :param min_cluster_size:

    """
    # do MST clustering by setting 'cutoff' between [cutoff_lower, cutoff_upper] with given interval
    cluster_num = {}
    cur_cutoff = cutoff_lower
    while cur_cutoff <= cutoff_upper:
        if cutoff_type == 'threshold':
            mstmodel = MSTClustering(cutoff_scale=cur_cutoff,
                                     min_cluster_size=min_cluster_size,
                                     metric='precomputed',
                                     approximate=False)
        else:
            mstmodel = MSTClustering(cutoff=cur_cutoff,
                                     min_cluster_size=min_cluster_size,
                                     metric='precomputed',
                                     approximate=False)
        mstmodel.fit(distance_matrix[0:-5, 0:-5])
        # filename = savepath + str(cur_cutoff) + '.png'
        cluster_num[cur_cutoff] = len(np.unique(mstmodel.labels_))
        # cv.plot_mst_result(mstmodel, pos, filename)
        cur_cutoff += interval
    sorted_cluster_num = sorted(cluster_num.iteritems(),
                                key=lambda item: item[1],
                                reverse=True)
    while top_k > 0:
        consensus_cutoff = sorted_cluster_num[top_k - 1][0]
        all_cluster_consensus(distance_matrix,
                              labels,
                              pos,
                              savepath,
                              logger,
                              mlset,
                              nlset,
                              cutoff=consensus_cutoff)
        top_k -= 1
    return

Пример #5

Показать файл

Файл: test_mst.py Проект: zhouce7/cluster_ensemble

def all_cluster_consensus(distance_matrix,
                          labels,
                          pos,
                          savepath,
                          logger,
                          mlset,
                          nlset,
                          cutoff=0.9):
    """

    :param distance_matrix:
    :param labels:
    :param pos:
    :param savepath:
    :param cutoff:
    :return:
    """
    target = labels[-1]
    class_num = len(np.unique(target))

    # do mst clustering, we assume that there should be more than 2 solutions in each cluster
    mstmodel = MSTClustering(cutoff=cutoff,
                             min_cluster_size=2,
                             metric='precomputed')
    mstmodel.fit(distance_matrix[0:-5, 0:-5])
    filename = savepath + str(cutoff) + '.png'
    cv.plot_mst_result(mstmodel, pos, filename)

    clusters = np.unique(mstmodel.labels_)

    # compute average consistency of each cluster of solutions
    avg_cons_both = Metrics.average_consistency(mstmodel.labels_, labels[0:-5],
                                                mlset, nlset)
    avg_cons_must = Metrics.average_consistency(mstmodel.labels_,
                                                labels[0:-5],
                                                mlset,
                                                nlset,
                                                cons_type='must')
    avg_cons_cannot = Metrics.average_consistency(mstmodel.labels_,
                                                  labels[0:-5],
                                                  mlset,
                                                  nlset,
                                                  cons_type='cannot')

    for i in clusters:
        # do consensus, note that last 4 rows should be skipped
        cluster_labels = labels[0:-4][mstmodel.labels_ == i]
        labels_CSPA = ce.cluster_ensembles_CSPAONLY(cluster_labels,
                                                    N_clusters_max=class_num)
        labels_HGPA = ce.cluster_ensembles_HGPAONLY(cluster_labels,
                                                    N_clusters_max=class_num)
        labels_MCLA = ce.cluster_ensembles_MCLAONLY(cluster_labels,
                                                    N_clusters_max=class_num)

        # print labels and diversities (between the real labels)
        nmi_CSPA = Metrics.normalized_max_mutual_info_score(
            labels_CSPA, target)
        nmi_HGPA = Metrics.normalized_max_mutual_info_score(
            labels_HGPA, target)
        nmi_MCLA = Metrics.normalized_max_mutual_info_score(
            labels_MCLA, target)
        logger.debug('Cluster ' + str(i) + ':')
        logger.debug('Both Consistency is ' + str(avg_cons_both[i]))
        logger.debug('Must Consistency is ' + str(avg_cons_must[i]))
        logger.debug('Cannot Consistency is ' + str(avg_cons_cannot[i]))
        logger.debug('CSPA performance is ' + str(nmi_CSPA))
        logger.debug('HGPA performance is ' + str(nmi_HGPA))
        logger.debug('MCLA performance is ' + str(nmi_MCLA))
        logger.debug(
            '-----------------------------mst finish--------------------------------'
        )
        logger.debug('')
    return