Example #1
def plot_consistency(labels, pos, mlset, nlset, savepath, consistency_type='both'):
    """
    plot consistency distribution of given library

    Parameters
    ----------
    :param labels:
    :param pos:
    :param mlset:
    :param nlset:
    :param savepath:
    :param consistency_type:
    """
    texts = []
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    for label in labels[0:-_ADDITIONAL_RANGE]:
        cons = Metrics.consistency(label, mlset, nlset, cons_type=consistency_type)
        texts.append(cons)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    # build the title from the numeric values before they are rounded to strings
    title = (consistency_type + ' Consistency, Max val = ' + str(max(texts)) +
             ', Min val = ' + str(min(texts)))
    texts = list(map(_round_digits, texts))
    texts.append('')
    texts.extend(_ADDITIONAL_NAMES[1:])
    colors.extend(_ADDITIONAL_COLORS)
    markers.extend(_ADDITIONAL_MARKERS)
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title)
    return
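For reference, the value-to-color mapping used above involves only standard matplotlib machinery. A minimal self-contained sketch with made-up consistency values (the numbers are illustrative, not from any real library):

import matplotlib.pyplot as plt
import matplotlib.colors as colors2
import matplotlib.cm as cmx

# synthetic values standing in for the output of Metrics.consistency
values = [0.12, 0.45, 0.78, 0.33]

# normalize the value range and map it through the CMRmap colormap
cNorm = colors2.Normalize(vmin=min(values), vmax=max(values))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
rgba_colors = [scalarMap.to_rgba(v) for v in values]
print(rgba_colors[0])  # an (r, g, b, a) tuple for the lowest value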
Example #2
def plot_normalized_consistency(labels, mlset, nlset, savepath, additional_values):
    """
    plot correlations between must and cannot consistency of given library

    Parameters
    ----------
    :param labels:
    :param mlset:
    :param nlset:
    :param savepath:
    :param additional_values:
    """
    texts = additional_values
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    title = 'Must-Cannot Correlation'
    must_consistencies = []
    cannot_consistencies = []
    # the last 5 rows (single k-means, CSPA, HGPA, MCLA and the real labels) are excluded
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label, mlset, nlset, cons_type='cannot')
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
    scaler = preprocessing.MinMaxScaler()
    must_consistencies = scaler.fit_transform(np.array(must_consistencies).reshape(-1, 1))
    cannot_consistencies = scaler.fit_transform(np.array(cannot_consistencies).reshape(-1, 1))
    pos = np.hstack((np.array(must_consistencies), np.array(cannot_consistencies)))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title,
                              xlabel='Must consistency', ylabel='Cannot consistency', legend_need=False)
    return
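The position matrix built above is simply two min-max-scaled consistency columns stacked side by side. A minimal sketch with synthetic values (numbers are illustrative):

import numpy as np
from sklearn import preprocessing

# synthetic must/cannot consistencies for four clusterings
must = np.array([0.2, 0.5, 0.9, 0.4]).reshape(-1, 1)
cannot = np.array([0.7, 0.1, 0.6, 0.3]).reshape(-1, 1)

# scale each column independently to [0, 1]
scaler = preprocessing.MinMaxScaler()
must_scaled = scaler.fit_transform(must)
cannot_scaled = scaler.fit_transform(cannot)

# one row per clustering: (x, y) scatter positions
pos = np.hstack((must_scaled, cannot_scaled))
print(pos.shape)  # (4, 2)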
Example #3
def _expected_consistency_selection(labels,
                                    mlset,
                                    nlset,
                                    cons_type='',
                                    ease_factor=1):
    """
    Select the clusterings whose consistency reaches ease_factor times the
    mean consistency of the clusterings that share the same k.
    """
    n_solutions = labels.shape[0]
    k_values = []
    cons = []
    final_idx = np.array([False] * n_solutions)
    for label in labels:
        cons.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    cons = np.array(cons)
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    for k in possible_k:
        mean_value = np.mean(cons[k_values == k])
        idx = np.logical_and(cons >= mean_value * ease_factor, k_values == k)
        final_idx = np.logical_or(final_idx, idx)
    return labels[final_idx]
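A toy run of this selection rule, with hand-made k values and consistencies standing in for real clusterings:

import numpy as np

# consistencies and k of six clusterings
cons = np.array([0.2, 0.4, 0.3, 0.9, 0.5, 0.7])
k_values = np.array([2, 2, 3, 3, 4, 4])

final_idx = np.zeros(len(cons), dtype=bool)
for k in np.unique(k_values):
    # keep the clusterings that reach the mean consistency of their own k group
    mean_value = np.mean(cons[k_values == k])
    final_idx |= np.logical_and(cons >= mean_value, k_values == k)
print(final_idx)  # [False  True False  True False  True]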
Example #4
def plot_k_consistency_distribution(labels, mlset, nlset, savepath, pure=True, cons_type='must'):
    """
    Plot the consistency of each clustering against its number of clusters (k).
    """
    k_value = []
    # impure libraries also store ensemble results and the real labels in the last 5 rows
    if not pure:
        labels = labels[0:-5]
    for label in labels:
        k_value.append(len(np.unique(label)))

    texts = [''] * len(labels)
    plot_labels = [None] * len(labels)
    markers = ['x'] * len(labels)
    colors = ['blue'] * len(labels)
    title = 'k-' + cons_type + ' Consistency Correlation'

    consistencies = []
    for label in labels:
        cons = Metrics.consistency(label, mlset, nlset, cons_type=cons_type)
        consistencies.append(cons)
    pos = np.hstack((np.array(k_value).reshape(-1, 1), np.array(consistencies).reshape(-1, 1)))
    print(pos.shape)

    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title,
                              xlabel='k', ylabel='consistency', legend_need=False)
    return
Example #5
def k_selection_ensemble(labels,
                         k_threshold,
                         logger,
                         weighted=False,
                         alpha=0,
                         mlset=None,
                         nlset=None,
                         ctype='both'):
    """
    Do selection ensemble using k as the criterion.
    Clusterings with k smaller than k_threshold will be removed.

    :param labels: 2-D array of clustering labels, one clustering per row
    :param k_threshold: minimum k a clustering must have to be kept
    :param logger: logger used to record the results
    :param weighted: weighted version or not
    :param alpha: balance factor that controls the importance of clustering/cluster
                  consistency in the weights (weighted version only)
    :param mlset: must-link set (weighted version only)
    :param nlset: cannot-link set (weighted version only)
    :param ctype: type of consistency (weighted version only)
    :return:
    """
    k_value = []
    class_num = len(np.unique(labels[-1]))
    # select the clusterings whose k is at least the threshold
    # (the last 5 rows hold single k-means, CSPA, HGPA, MCLA and the real labels)
    for label in labels[0:-5]:
        k_value.append(len(np.unique(label)))
    k_value = np.array(k_value)
    idx = k_value.ravel() >= k_threshold
    selected_labels = labels[0:-5][idx]

    # weights
    con_per_cluster = []
    con_clustering = []
    if weighted:
        for label in selected_labels:
            con_per_cluster.append(
                Metrics.consistency_per_cluster(label,
                                                mlset,
                                                nlset,
                                                cons_type=ctype))
        for label in selected_labels:
            con_clustering.append(
                Metrics.consistency(label, mlset, nlset, cons_type=ctype))

    logger.debug('[K] Start consensus...shape=' + str(selected_labels.shape))
    logger.debug('[K] Average k is ' + str(np.mean(k_value[idx])))
    if weighted:
        logger.debug('[K] weighted consensus, alpha=' + str(alpha))

    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)

    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    logger.debug('--------------------------------------------')
    return
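Metrics.normalized_max_mutual_info_score is project-specific code not shown in these examples; scikit-learn's normalized_mutual_info_score computes the same kind of label-permutation-invariant agreement score and can serve as a rough stand-in for experimentation (a sketch, not the project's own metric):

import numpy as np
from sklearn.metrics import normalized_mutual_info_score

truth = np.array([0, 0, 1, 1, 2, 2])
consensus = np.array([1, 1, 0, 0, 2, 2])  # same partition, permuted label ids

# NMI is invariant to label permutation, so this scores a perfect 1.0
print(normalized_mutual_info_score(truth, consensus))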
Example #6
def consistency_selection_ensemble(labels,
                                   mlset,
                                   nlset,
                                   logger,
                                   must_threshold,
                                   cannot_threshold,
                                   normalized=True,
                                   weighted=False,
                                   weighted_type='both',
                                   alpha=1):
    """
    Do selection ensemble using must/cannot consistency as the criteria.
    Clusterings whose must or cannot consistency falls below the corresponding
    threshold will be removed.

    :param labels: 2-D array of clustering labels, one clustering per row
    :param mlset: must-link set
    :param nlset: cannot-link set
    :param logger: logger used to record the results
    :param must_threshold: minimum must consistency a clustering must reach
    :param cannot_threshold: minimum cannot consistency a clustering must reach
    :param normalized: min-max scale the consistencies before thresholding
    :param weighted: weighted version or not
    :param weighted_type: type of consistency used for the weights
    :param alpha: balance factor between clustering/cluster level weights
    :return:
    """
    class_num = len(np.unique(labels[-1]))
    must_consistencies = []
    cannot_consistencies = []
    clustering_weights = []
    cluster_level_weights = []
    k_value = []
    # the last 5 rows (single k-means, CSPA, HGPA, MCLA and the real labels) are excluded
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label,
                                          mlset,
                                          nlset,
                                          cons_type='cannot')
        if weighted:
            clustering_weights.append(
                Metrics.consistency(label,
                                    mlset,
                                    nlset,
                                    cons_type=weighted_type))
            cluster_level_weights.append(
                Metrics.consistency_per_cluster(label,
                                                mlset,
                                                nlset,
                                                cons_type=weighted_type))
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
        k_value.append(len(np.unique(label)))
    must_consistencies = np.array(must_consistencies)
    cannot_consistencies = np.array(cannot_consistencies)
    if normalized:
        scaler = preprocessing.MinMaxScaler()
        must_consistencies = scaler.fit_transform(
            must_consistencies.reshape(-1, 1)).ravel()
        cannot_consistencies = scaler.fit_transform(
            cannot_consistencies.reshape(-1, 1)).ravel()
    # element-wise comparison requires arrays, also when normalized is False
    idx = np.logical_and(must_consistencies >= must_threshold,
                         cannot_consistencies >= cannot_threshold)
    selected_labels = labels[0:-5][idx]
    k_value = np.array(k_value)[idx]
    logger.debug('[Consistency] Start consensus...shape=' +
                 str(selected_labels.shape))
    if selected_labels.shape[0] == 0:
        logger.debug('[Consistency] No clusterings are selected. Out.')
        return
    logger.debug('[Consistency] Average k is ' + str(np.mean(k_value)))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights,
        alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights,
        alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels,
                                               N_clusters_max=class_num)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    return
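The joint selection step reduces to a single boolean mask over the two consistency vectors; a sketch with synthetic numbers and illustrative thresholds of 0.5:

import numpy as np

must = np.array([0.9, 0.2, 0.8, 0.6])
cannot = np.array([0.7, 0.9, 0.3, 0.8])

# a clustering survives only if it clears BOTH thresholds
idx = np.logical_and(must >= 0.5, cannot >= 0.5)
print(idx)        # [ True False False  True]
print(idx.sum())  # 2 clusterings selected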
Example #7
# subd = sub.feature_sampling(d, 2000)
# print d.shape
# print subd.shape
# data_selected, data_unselected, \
# target_selected, target_unselected = train_test_split(d, t,
#                                                       train_size=500,
#                                                       random_state=154)
# print data_selected
# print data_unselected
# print target_selected
# print target_unselected
# print d
# ml, cl = io.read_constraints('Constraints/Wap_constraints_2n.txt')
# ml, cl = io.read_constraints('Constraints/k1b_constraints_2n.txt')
# load must-link / cannot-link constraints and check their consistency with the ground truth t
ml, cl = io.read_constraints('Constraints/waveform_constraints_half_n.txt')
print metrics.consistency(t, ml, cl)
# e2cp = cc.E2CP(data=d, ml=ml, cl=cl, n_clusters=6)
# t1 = time.clock()
# e2cp.fit_constrained()
# t2 = time.clock()
# print t
# print np.unique(t)
# print metrics.normalized_max_mutual_info_score(t, e2cp.labels)
# print (t2 - t1)
# t1 = time.clock()
# run constrained (COP-) k-means with k=3 under the loaded constraints
label = eck.cop_kmeans_wrapper(d, 3, ml, cl)
# t2 = time.clock()
# km = cluster.KMeans(n_clusters=20)
# km.fit(d)
# evaluate the constrained clustering against the ground truth
print metrics.normalized_max_mutual_info_score(t, label)
# print (t2 - t1)
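Metrics.consistency itself is not shown in these examples. A common definition, and a plausible reading of what it computes, is the fraction of pairwise constraints a labelling satisfies (must-link pairs placed in the same cluster, cannot-link pairs in different ones). A minimal sketch under that assumption, using a hypothetical constraint_consistency helper:

import numpy as np

def constraint_consistency(labels, ml, cl):
    """Fraction of satisfied constraints; ml/cl are lists of index pairs."""
    satisfied = sum(1 for i, j in ml if labels[i] == labels[j])
    satisfied += sum(1 for i, j in cl if labels[i] != labels[j])
    total = len(ml) + len(cl)
    return float(satisfied) / total if total else 0.0

labels = np.array([0, 0, 1, 1])
ml = [(0, 1)]          # should share a cluster -> satisfied
cl = [(1, 2), (0, 3)]  # should be separated   -> both satisfied
print(constraint_consistency(labels, ml, cl))  # 1.0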
Example #8
def do_new_weighted_ensemble_for_library(
        library_folder,
        library_name,
        class_num,
        target,
        constraint_file,
        logger,
        gammas,
        internals=None,
        cons_type='both',
        ensemble_method=_default_ensemble_method,
        scale=False):
    """

    :param library_folder:
    :param library_name:
    :param class_num:
    :param target:
    :param constraint_file:
    :param logger:
    :param alphas:
    :param cons_type:
    :param ensemble_method
    :return:
    """
    logger.debug(
        '==========================================================================================='
    )
    logger.debug('-----------------New ver Weighted Ensemble for library:' +
                 str(library_name) + '---------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)

    # if the library is not pure, i.e. ensemble results and the targets are also
    # included, the last 5 rows must be removed (single k-means, CSPA, HGPA, MCLA, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)
    n_instances = labels.shape[1]
    if cons_type == 'both':
        n_constraints = len(mlset) + len(nlset)
    else:
        n_constraints = len(mlset)
    if internals is None:
        internals = _build_pesudo_internal(labels)

    # get cluster/clustering level weights
    # constraints in each cluster of all clusterings are also obtained to get g_gamma
    con_per_cluster = []
    constraints_num = []
    con_clustering = []
    cluster_time_sum = 0.0
    clustering_time_sum = 0.0
    for label in labels:
        t1 = time.clock()
        weight, cluster_cons_num = Metrics.consistency_per_cluster_efficient(
            label, mlset, nlset, cons_type=cons_type)
        con_per_cluster.append(weight)
        constraints_num.append(cluster_cons_num)
        t2 = time.clock()
        cluster_time_sum += (t2 - t1)
    for label in labels:
        t1 = time.clock()
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        t2 = time.clock()
        clustering_time_sum += (t2 - t1)

    print 'library size=' + str(labels.shape[0])
    print 'cluster avg=' + str(cluster_time_sum / labels.shape[0])
    print 'clustering avg=' + str(clustering_time_sum / labels.shape[0])

    if scale:
        scaler = preprocessing.MinMaxScaler()
        con_clustering = scaler.fit_transform(np.array(con_clustering))

    nmis = []
    for gamma in gammas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_g_gamma = get_g_gamma(constraints_num, labels, n_constraints,
                                  n_instances, gamma)
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels,
                N_clusters_max=class_num,
                weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster,
                alpha=cur_g_gamma,
                new_formula=True,
                internal=internals)
            # ensemble_labels = _ensemble_method[method](labels, N_clusters_max=class_num,
            #                                            weighted=True, clustering_weights=con_clustering,
            #                                            cluster_level_weights=con_per_cluster, alpha=cur_g_gamma,
            #                                            new_formula=True, internal=internals, ml=mlset, cl=nlset)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_labels, target)
            logger.debug(method + ' gamma=' + str(gamma) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(
        '==========================================================================================='
    )
    return nmis
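The .res library files read above are plain comma-separated label matrices, one clustering per row. A self-contained round-trip showing the loading convention (the file name toy_library.res is illustrative):

import numpy as np

# write a tiny library: 3 clusterings of 5 instances, one clustering per row
lib = np.array([[0, 0, 1, 1, 2],
                [1, 1, 0, 0, 0],
                [0, 1, 1, 2, 2]])
np.savetxt('toy_library.res', lib, delimiter=',')

labels = np.loadtxt('toy_library.res', delimiter=',')
labels = labels.astype(int)  # loadtxt returns floats
print(labels.shape)  # (3, 5)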
Example #9
def do_7th_weighted_ensemble_for_library(
        library_folder,
        library_name,
        class_num,
        target,
        constraint_file,
        logger,
        alphas,
        internals,
        cons_type='both',
        ensemble_method=_default_ensemble_method,
        scale=False):
    """

    :param library_folder:
    :param library_name:
    :param class_num:
    :param target:
    :param constraint_file:
    :param logger:
    :param alphas:
    :param cons_type:
    :param ensemble_method
    :return:
    """
    logger.debug(
        '==========================================================================================='
    )
    logger.debug('-----------------New Weighted Ensemble for library:' +
                 str(library_name) + '-------------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)
    k_values = []
    expected_cons = {}

    # if the library is not pure, i.e. ensemble results and the targets are also
    # included, the last 5 rows must be removed (single k-means, CSPA, HGPA, MCLA, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)

    # get cluster/clustering level weights
    con_per_cluster = []
    con_clustering = []
    for label in labels:
        con_per_cluster.append(
            Metrics.consistency_per_cluster(label,
                                            mlset,
                                            nlset,
                                            cons_type=cons_type))
    for label in labels:
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    cons = np.array(con_clustering)
    for k in possible_k:
        mean_value = np.mean(cons[k_values == k])
        # guard against dividing by zero below
        if mean_value == 0:
            mean_value = 1
        expected_cons[k] = mean_value
    # normalize each clustering's consistency by the expected value of its k
    # group and weight it by its internal validity index
    for i in range(0, labels.shape[0]):
        con_clustering[i] /= expected_cons[k_values[i]]
        con_clustering[i] *= internals[i]
    if scale:
        scaler = preprocessing.MinMaxScaler()
        con_clustering = scaler.fit_transform(np.array(con_clustering))

    nmis = []
    for alpha in alphas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels,
                N_clusters_max=class_num,
                weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster,
                alpha=alpha)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_labels, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(
        '==========================================================================================='
    )
    return nmis
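The expected-consistency weighting above divides each clustering's consistency by the mean of its own k group and multiplies by its internal index. A toy version of that computation with made-up numbers:

import numpy as np

con_clustering = [0.2, 0.4, 0.3, 0.9]  # raw clustering-level consistencies
k_values = np.array([2, 2, 3, 3])
internals = [1.0, 0.8, 1.2, 0.9]       # internal validity indices

# expected consistency = mean consistency of the clusterings with the same k
cons = np.array(con_clustering)
expected_cons = {k: np.mean(cons[k_values == k]) for k in np.unique(k_values)}

for i in range(len(con_clustering)):
    con_clustering[i] /= expected_cons[k_values[i]]
    con_clustering[i] *= internals[i]
# clusterings above their group mean end up weighted above their internal index
print(con_clustering)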