Пример #1
0
def mst_with_cutoff(distance_matrix,
                    pos,
                    savepath,
                    cutoff_type='rate',
                    cutoff_upper=0.9,
                    cutoff_lower=0.1,
                    interval=0.05):
    """
    Run MST clustering once per value of an evenly spaced 'cutoff' grid and plot every result.

    Used for deciding the suitable cutoff value which will be utilized in the MST selection procedure.
    The grid is cutoff_lower, cutoff_lower + interval, ... up to and including cutoff_upper
    (when it lies on the grid). For each value a model is fitted and handed to
    ``cv.plot_mst_result`` together with ``pos`` and the file name
    ``savepath + str(cutoff) + '.png'``.

    Parameters
    ----------
    :param distance_matrix: distance matrix of given library of solutions; the last 5 rows / columns
                            are excluded before fitting (presumably consensus / real-label entries
                            appended to the library -- confirm against the caller)
    :param pos: positions generated by MDS or other embedding approaches, used for visualization
    :param savepath: path prefix where the visualization of clustering result is stored
    :param cutoff_type: type of parameter 'cutoff', should either be :
                       'threshold' :  edges with larger weight than threshold will be cut
                                      (passed to MSTClustering as ``cutoff_scale``)
                       'rate' :       number(>=1) / fraction(<1.0) of edges that will be cut
                                      (passed to MSTClustering as ``cutoff``; also used for any
                                      other value of cutoff_type)
    :param cutoff_upper: upper bound of parameter cutoff
    :param cutoff_lower: lower bound of parameter cutoff
    :param interval: interval of parameter cutoff, must be positive
    :raises ValueError: if the range is non-empty and ``interval`` is not positive
    :return: None

    """
    if cutoff_upper < cutoff_lower:
        # empty range: nothing to cluster or plot
        return
    if interval <= 0:
        # a non-positive step would never reach cutoff_upper
        raise ValueError('interval must be positive, got ' + str(interval))

    # Derive every grid point from its index instead of repeatedly adding `interval`:
    # accumulated float error could otherwise overshoot cutoff_upper (silently dropping
    # the last point) and leak into the file names (e.g. '0.15000000000000002.png').
    # The small epsilon absorbs representation error in the division.
    step_count = int((cutoff_upper - cutoff_lower) / float(interval) + 1e-9) + 1
    submatrix = distance_matrix[0:-5, 0:-5]

    for step in range(step_count):
        cur_cutoff = cutoff_lower + step * interval
        if isinstance(cur_cutoff, float):
            # strip float noise; integer cutoffs (edge counts) are left untouched
            cur_cutoff = round(cur_cutoff, 10)

        if cutoff_type == 'threshold':
            mstmodel = MSTClustering(cutoff_scale=cur_cutoff,
                                     min_cluster_size=2,
                                     metric='precomputed',
                                     approximate=False)
        else:
            mstmodel = MSTClustering(cutoff=cur_cutoff,
                                     min_cluster_size=2,
                                     metric='precomputed',
                                     approximate=False)
        mstmodel.fit(submatrix)
        filename = savepath + str(cur_cutoff) + '.png'
        cv.plot_mst_result(mstmodel, pos, filename)
    return
Пример #2
0
def consistency_selection(nidpath,
                          pospath,
                          respath,
                          savepath,
                          mlset,
                          nlset,
                          distype='nid',
                          cutoff=0.9):
    """
    For every distance-matrix file, pick the MST cluster of solutions with the largest
    average consistency, build consensus labelings from it and save a 2-D scatter plot.

    Each regular, non-hidden file found directly in ``nidpath`` is processed as follows:
    the distance matrix, the matching '<name>_mds2d.txt' positions and '<name>.res' labels
    are loaded, MST clustering is run on the library of solutions, the cluster with the
    highest average consistency is selected, CSPA / HGPA / MCLA consensus labelings are
    computed from that cluster and scored against the real labels, and the figure is
    written to ``savepath + dataset_name + '/'``.

    :param nidpath: directory holding the distance-matrix files ('~' is expanded)
    :param pospath: path prefix of the position files
    :param respath: path prefix of the label ('.res') files
    :param savepath: path prefix under which one directory per dataset is created for the figures
    :param mlset: forwarded to Metrics.average_consistency (presumably must-link constraints -- confirm)
    :param nlset: forwarded to Metrics.average_consistency (presumably cannot-link constraints -- confirm)
    :param distype: distance tag; the base name is the file name cut at '_' + distype
    :param cutoff: 'cutoff' parameter handed to MSTClustering
    :return: None
    """
    nidpath = os.path.expanduser(nidpath)
    for f in os.listdir(nidpath):
        if f.startswith('.'):
            continue
        fullpath = os.path.join(nidpath, f)
        if not os.path.isfile(fullpath):
            continue

        filename = os.path.splitext(f)[0].split('_' + distype)[0]
        dataset_name = filename.split('_')[0]
        if not os.path.isdir(savepath + dataset_name):
            os.mkdir(savepath + dataset_name)

        # read distance matrix, position matrix and label matrix from external file
        # note that the last 4 rows / cols are naive consensus & real labels
        distanceMatrix = np.loadtxt(fullpath, delimiter=',')
        pos = np.loadtxt(pospath + filename + '_mds2d.txt', delimiter=',')
        labels = np.loadtxt(respath + filename + '.res', delimiter=',')
        labels = labels.astype(int)

        # real labels store in the last row
        target = labels[-1]
        class_num = len(np.unique(target))

        # do mst clustering, we assume that there should be more than 5 solutions in each cluster
        mstmodel = MSTClustering(cutoff=cutoff,
                                 min_cluster_size=5,
                                 metric='precomputed')
        mstmodel.fit(distanceMatrix[0:-4, 0:-4])

        # compute average consistency of each cluster of solutions
        avg_cons = Metrics.average_consistency(mstmodel.labels_,
                                               labels[0:-4], mlset, nlset)

        # find the cluster of solution with largest consistency; label -1 is skipped
        # (it is plotted as 'Outliers' below). If no cluster has a consistency above 0
        # the selection stays at cluster 0.
        maxclu = 0
        max_cons = 0.0
        print(avg_cons)
        # .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only)
        for clu, cons in avg_cons.items():
            if clu == -1:
                continue
            if cons > max_cons:
                maxclu = clu
                max_cons = cons

        # do consensus, note that last 4 rows should be skipped
        cluster_labels = labels[0:-4][mstmodel.labels_ == maxclu]
        labels_CSPA = ce.cluster_ensembles_CSPAONLY(
            cluster_labels, N_clusters_max=class_num)
        labels_HGPA = ce.cluster_ensembles_HGPAONLY(
            cluster_labels, N_clusters_max=class_num)
        labels_MCLA = ce.cluster_ensembles_MCLAONLY(
            cluster_labels, N_clusters_max=class_num)

        # print the scores against the real labels
        # NOTE: the printed text says 'diversity' but the value is 1 - diversity
        nmi_CSPA = 1 - Metrics.diversityBtw2Cluster(labels_CSPA, target)
        nmi_HGPA = 1 - Metrics.diversityBtw2Cluster(labels_HGPA, target)
        nmi_MCLA = 1 - Metrics.diversityBtw2Cluster(labels_MCLA, target)
        print('consensus result diversity (CSPA) =' + str(nmi_CSPA))
        print('consensus diversity (HGPA) =' + str(nmi_HGPA))
        print('consensus diversity (MCLA) =' + str(nmi_MCLA))

        # store visualization file using 2d-MDS
        plt.figure(1)
        plt.clf()
        # Create the full-figure axes once. Calling plt.axes() per cluster adds a new
        # overlapping axes on every iteration with current matplotlib, so the legend
        # would only list the artists drawn on the last one.
        plt.axes([0., 0., 1., 1.])
        for i in np.unique(mstmodel.labels_):
            members = mstmodel.labels_ == i
            xs = pos[0:-4][members, 0]
            ys = pos[0:-4][members, 1]
            # shift by one so that the outlier label -1 maps to the first colour
            color = _colors[(int(i) + 1) % len(_colors)]
            if i == -1:
                plt.scatter(xs, ys, c=color, label='Outliers')
            elif i == maxclu:
                plt.scatter(xs, ys, c=color, marker='*', label='Selected')
            else:
                plt.scatter(xs, ys, c=color, label='Clusters-' + str(i))

        # the three rows before the last one are plotted as consensus solutions,
        # the last row as the real labels
        plt.scatter(pos[-4:-1, 0],
                    pos[-4:-1, 1],
                    c='blue',
                    marker='D',
                    label='Consensus')
        plt.scatter(pos[-1:, 0],
                    pos[-1:, 1],
                    c='red',
                    marker='D',
                    label='Real')
        plt.legend(loc='best', shadow=True)
        plt.savefig(savepath + dataset_name + '/' + filename +
                    '_afterMST_selection_' + str(cutoff) + '.png',
                    format='png',
                    dpi=240)

    return
Пример #3
0
def all_cluster_consensus(nidpath, respath, distype='nid', cutoff=0.9):
    """
    Run MST clustering on every distance-matrix file and print consensus scores per cluster.

    Each regular, non-hidden file found directly in ``nidpath`` is loaded together with its
    '<name>.res' label file. The library of solutions is clustered with MSTClustering and,
    for every resulting cluster label, CSPA / HGPA / MCLA consensus labelings are built from
    the member solutions and scored against the real labels (last row of the label file).

    :param nidpath: directory holding the distance-matrix files ('~' is expanded)
    :param respath: path prefix of the label ('.res') files
    :param distype: distance tag; the base name is the file name cut at '_' + distype
    :param cutoff: 'cutoff' parameter handed to MSTClustering
    :return: None
    """
    source_dir = os.path.expanduser(nidpath)
    for entry in os.listdir(source_dir):
        if entry.startswith('.'):
            continue
        matrix_path = os.path.join(source_dir, entry)
        if not os.path.isfile(matrix_path):
            continue

        stem = os.path.splitext(entry)[0]
        base_name = stem.split('_' + distype)[0]

        # load the distance matrix and the label matrix; the last 4 rows / cols
        # hold the naive consensus solutions and the real labels
        print(matrix_path)
        distances = np.loadtxt(matrix_path, delimiter=',')
        all_labels = np.loadtxt(respath + base_name + '.res',
                                delimiter=',').astype(int)

        # the real labels sit in the last row
        target = all_labels[-1]
        class_num = len(np.unique(target))

        # cluster the library of solutions; clusters need at least 5 members
        model = MSTClustering(cutoff=cutoff,
                              min_cluster_size=5,
                              metric='precomputed')
        model.fit(distances[0:-4, 0:-4])

        library = all_labels[0:-4]
        for cluster_id in np.unique(model.labels_):
            # consensus over the solutions that belong to this cluster
            members = library[model.labels_ == cluster_id]
            consensus = [
                ce.cluster_ensembles_CSPAONLY(members,
                                              N_clusters_max=class_num),
                ce.cluster_ensembles_HGPAONLY(members,
                                              N_clusters_max=class_num),
                ce.cluster_ensembles_MCLAONLY(members,
                                              N_clusters_max=class_num),
            ]

            # score each consensus labeling against the real labels
            nmi_cspa, nmi_hgpa, nmi_mcla = [
                1 - Metrics.diversityBtw2Cluster(result, target)
                for result in consensus
            ]

            print('Cluster ' + str(cluster_id) +
                  '===========================================')
            print('consensus result diversity (CSPA) =' + str(nmi_cspa))
            print('consensus diversity (HGPA) =' + str(nmi_hgpa))
            print('consensus diversity (MCLA) =' + str(nmi_mcla))

    return
Пример #4
0
def mst_with_cutoff(distance_matrix,
                    pos,
                    labels,
                    savepath,
                    logger,
                    mlset,
                    nlset,
                    cutoff_type='rate',
                    cutoff_upper=0.9,
                    cutoff_lower=0.1,
                    interval=0.05,
                    min_cluster_size=2,
                    top_k=5):
    """
    Scan a grid of 'cutoff' values with MST clustering, then run the per-cluster consensus
    evaluation for the ``top_k`` cutoffs that produced the most distinct cluster labels.

    Used for deciding the suitable cutoff value which will be utilized in the MST selection procedure.
    The grid is cutoff_lower, cutoff_lower + interval, ... up to and including cutoff_upper
    (when it lies on the grid). Cutoffs are ranked by the number of distinct labels in the
    fitted model (descending; ties keep ascending cutoff order) and ``all_cluster_consensus``
    is called for the ranked cutoffs from position ``top_k`` down to the best one.

    Parameters
    ----------
    :param distance_matrix: distance matrix of given library of solutions; the last 5 rows / columns
                            are excluded before fitting (presumably consensus / real-label entries
                            appended to the library -- confirm against the caller)
    :param pos: positions generated by MDS or other embedding approaches, forwarded to
                all_cluster_consensus
    :param labels: label matrix of the library, forwarded to all_cluster_consensus
    :param savepath: path prefix for the visualizations, forwarded to all_cluster_consensus
    :param logger: logger forwarded to all_cluster_consensus
    :param mlset: forwarded to all_cluster_consensus (presumably must-link constraints -- confirm)
    :param nlset: forwarded to all_cluster_consensus (presumably cannot-link constraints -- confirm)
    :param cutoff_type: type of parameter 'cutoff', should either be :
                       'threshold' :  edges with larger weight than threshold will be cut
                                      (passed to MSTClustering as ``cutoff_scale``)
                       'rate' :       number(>=1) / fraction(<1.0) of edges that will be cut
                                      (passed to MSTClustering as ``cutoff``; also used for any
                                      other value of cutoff_type)
    :param cutoff_upper: upper bound of parameter cutoff
    :param cutoff_lower: lower bound of parameter cutoff
    :param interval: interval of parameter cutoff, must be positive
    :param min_cluster_size: minimum cluster size used during the scan
                             (NOTE: only the scan uses it; it is not forwarded to all_cluster_consensus)
    :param top_k: number of best-ranked cutoffs to evaluate; clamped to the number of scanned cutoffs
    :raises ValueError: if the range is non-empty and ``interval`` is not positive
    :return: None

    """
    # (cutoff, number of distinct labels) pairs, in ascending cutoff order
    scanned = []
    if cutoff_upper >= cutoff_lower:
        if interval <= 0:
            # a non-positive step would never reach cutoff_upper
            raise ValueError('interval must be positive, got ' + str(interval))

        # Derive every grid point from its index instead of repeatedly adding `interval`:
        # accumulated float error could otherwise overshoot cutoff_upper and silently
        # drop the last grid point. The epsilon absorbs error in the division.
        step_count = int(
            (cutoff_upper - cutoff_lower) / float(interval) + 1e-9) + 1
        submatrix = distance_matrix[0:-5, 0:-5]

        for step in range(step_count):
            cur_cutoff = cutoff_lower + step * interval
            if isinstance(cur_cutoff, float):
                # strip float noise; integer cutoffs (edge counts) are left untouched
                cur_cutoff = round(cur_cutoff, 10)

            if cutoff_type == 'threshold':
                mstmodel = MSTClustering(cutoff_scale=cur_cutoff,
                                         min_cluster_size=min_cluster_size,
                                         metric='precomputed',
                                         approximate=False)
            else:
                mstmodel = MSTClustering(cutoff=cur_cutoff,
                                         min_cluster_size=min_cluster_size,
                                         metric='precomputed',
                                         approximate=False)
            mstmodel.fit(submatrix)
            scanned.append((cur_cutoff, len(np.unique(mstmodel.labels_))))

    # most distinct labels first; sorted() is stable, so ties stay in scan order
    ranked = sorted(scanned, key=lambda item: item[1], reverse=True)

    # Evaluate from the top_k-th ranked cutoff down to the best one. top_k is clamped
    # so that asking for more cutoffs than were scanned no longer raises IndexError.
    for rank in range(min(top_k, len(ranked)) - 1, -1, -1):
        all_cluster_consensus(distance_matrix,
                              labels,
                              pos,
                              savepath,
                              logger,
                              mlset,
                              nlset,
                              cutoff=ranked[rank][0])
    return
Пример #5
0
def all_cluster_consensus(distance_matrix,
                          labels,
                          pos,
                          savepath,
                          logger,
                          mlset,
                          nlset,
                          cutoff=0.9):
    """
    Cluster the library of solutions with MST clustering at one cutoff, plot the result and
    log consistency and consensus performance for every cluster.

    The model is fitted on the distance matrix without its last 5 rows / columns and plotted
    to ``savepath + str(cutoff) + '.png'``. For each cluster label the member solutions are
    combined with CSPA / HGPA / MCLA and each consensus labeling is scored against the real
    labels (last row of ``labels``).

    :param distance_matrix: distance matrix of the library of solutions; the last 5 rows / columns
                            are excluded before fitting
    :param labels: label matrix, one solution per row; the last row is used as the real labels and
                   the last 5 rows are excluded from the library
    :param pos: positions forwarded to cv.plot_mst_result for visualization
    :param savepath: path prefix of the saved figure
    :param logger: object whose ``debug`` method receives the per-cluster report
    :param mlset: forwarded to Metrics.average_consistency (presumably must-link constraints -- confirm)
    :param nlset: forwarded to Metrics.average_consistency (presumably cannot-link constraints -- confirm)
    :param cutoff: 'cutoff' parameter handed to MSTClustering
    :return: None
    """
    target = labels[-1]
    class_num = len(np.unique(target))

    # rows of `labels` that correspond to the clustered solutions
    library = labels[0:-5]

    # do mst clustering, we assume that there should be more than 2 solutions in each cluster
    mstmodel = MSTClustering(cutoff=cutoff,
                             min_cluster_size=2,
                             metric='precomputed')
    mstmodel.fit(distance_matrix[0:-5, 0:-5])
    filename = savepath + str(cutoff) + '.png'
    cv.plot_mst_result(mstmodel, pos, filename)

    clusters = np.unique(mstmodel.labels_)

    # compute average consistency of each cluster of solutions
    avg_cons_both = Metrics.average_consistency(mstmodel.labels_, library,
                                                mlset, nlset)
    avg_cons_must = Metrics.average_consistency(mstmodel.labels_,
                                                library,
                                                mlset,
                                                nlset,
                                                cons_type='must')
    avg_cons_cannot = Metrics.average_consistency(mstmodel.labels_,
                                                  library,
                                                  mlset,
                                                  nlset,
                                                  cons_type='cannot')

    for i in clusters:
        # do consensus on the members of this cluster. The mask has one entry per fitted
        # solution, so it must index the same 5-row-trimmed library the model was fitted on
        # (slicing with 0:-4 here left the mask one element short of the indexed array).
        cluster_labels = library[mstmodel.labels_ == i]
        labels_CSPA = ce.cluster_ensembles_CSPAONLY(cluster_labels,
                                                    N_clusters_max=class_num)
        labels_HGPA = ce.cluster_ensembles_HGPAONLY(cluster_labels,
                                                    N_clusters_max=class_num)
        labels_MCLA = ce.cluster_ensembles_MCLAONLY(cluster_labels,
                                                    N_clusters_max=class_num)

        # score each consensus labeling against the real labels
        nmi_CSPA = Metrics.normalized_max_mutual_info_score(
            labels_CSPA, target)
        nmi_HGPA = Metrics.normalized_max_mutual_info_score(
            labels_HGPA, target)
        nmi_MCLA = Metrics.normalized_max_mutual_info_score(
            labels_MCLA, target)
        logger.debug('Cluster ' + str(i) + ':')
        logger.debug('Both Consistency is ' + str(avg_cons_both[i]))
        logger.debug('Must Consistency is ' + str(avg_cons_must[i]))
        logger.debug('Cannot Consistency is ' + str(avg_cons_cannot[i]))
        logger.debug('CSPA performance is ' + str(nmi_CSPA))
        logger.debug('HGPA performance is ' + str(nmi_HGPA))
        logger.debug('MCLA performance is ' + str(nmi_MCLA))
        logger.debug(
            '-----------------------------mst finish--------------------------------'
        )
        logger.debug('')
    return