Example #1
    def test_synthetic_circles(self):
        print('''
            two concentric circles
        ''')
        N = 10**3
        X, y = make_circles(n_samples=N, noise=1.0)
        k = len(np.unique(y))

        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        sklearn_mse = ((X - X_hat)**2).mean()
        score = metrics.homogeneity_completeness_v_measure(labels, y)
        print(f'sklearn mse: {sklearn_mse}')
        print(f'sklearn scores: {score}')

        displacements = np.nan_to_num(X_incomplete)

        spans = np.nan_to_num(X_incomplete)
        spans[spans == 0] = 1
        spans[spans != 1] = 0

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
        config = ParameterConfig()

        ## data
        m = 100  # coreset size ~ reduction ratio
        tau = 1e-2

        config.a_b_approx_minimum_number_of_lines = 100  # constant 100, line 2, algo 2 BI-CRITERIA
        config.sample_size_for_a_b_approx = int(
            m * 1.05)  # |S| >= m, line 3 of algo 2
        # note: there'll be an O(|S|^2) cost while computing algo 1
        config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        config.number_of_remains_multiply_factor = int(
            math.log(N)
        ) // k  # this is `b` in algo 2, other paper, set arbitrarily here - how to calculate it?
        config.closest_to_median_rate = (1 - tau) / (
            2 * k)  # refer line 4, algo 1, other paper
        config.median_sample_size = int(
            N * 0.05)  # size of q_i, line 3, algo 2, other paper
        config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
        config.number_of_remains = 20

        SAMPLE_SIZE = 50  # keep it < 100, works fast

        ITER = 5
        klines_mse = np.zeros(ITER)
        scores = [[]] * ITER
        for i in range(ITER):
            print(f'Running KLines iter {i+1} of {ITER}')
            X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
            klines_mse[i] = ((X - X_klines)**2).mean()
            scores[i] = metrics.homogeneity_completeness_v_measure(
                kl_labels, y)

        print(f"Klines MSE: {klines_mse.mean()}")
        print(f"Klines scores: {np.array(scores).mean(axis=0)}")

        assert sklearn_mse / klines_mse.mean() > 0.5
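The helpers create_incomplete_matrix and kmeans_missing are project utilities that are not shown in this listing. As a rough guide, here is a minimal sketch of what kmeans_missing plausibly does, alternating centroid-based imputation with sklearn's KMeans; the signature matches the call above, but the body is an assumption, not the project's actual implementation:

import numpy as np
from sklearn.cluster import KMeans

def kmeans_missing(X, n_clusters, max_iter=10):
    missing = np.isnan(X)  # mask of the missing entries
    # start by filling missing entries with per-column means
    X_hat = np.where(missing, np.nanmean(X, axis=0, keepdims=True), X)
    for _ in range(max_iter):
        km = KMeans(n_clusters=n_clusters, n_init=10)
        labels = km.fit_predict(X_hat)
        # re-impute each missing entry from its assigned cluster centroid
        X_hat[missing] = km.cluster_centers_[labels][missing]
    return labels, km.cluster_centers_, X_hat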
Example #2
File: car.py Project: kraemerk/CS4641hw3
def expectation_maximization(X,y,dataset_name):
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=65)
    train_scores = []
    train_homo = []
    train_completeness = []
    train_v_score = []

    test_scores = []
    test_homo = []
    test_completeness = []
    test_v_score = []

    kvals = [x for x in range(2,51)]

    for k in range(2, 51):
        print("k= {}".format(k))
        clf = GaussianMixture(n_components=k, max_iter=1000)
        # Train on train data, recording accuracy, homogeneity, completeness, and v_measure
        train_pred = clf.fit_predict(X_train)
        train_score = fowlkes_mallows_score(y_train, train_pred)
        train_scores.append(train_score)
        homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_train, train_pred)
        train_homo.append(homogeneity)
        train_completeness.append(completeness)
        train_v_score.append(v_measure)

        # Evaluate same metrics on test set
        test_pred = clf.predict(X_test)
        test_score = fowlkes_mallows_score(y_test, test_pred)
        test_scores.append(test_score)
        homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_test, test_pred)
        test_homo.append(homogeneity)
        test_completeness.append(completeness)
        test_v_score.append(v_measure)

    print("done")
    print("generating plots")

    plt.figure()
    plt.title('Fowlkes-Mallows Score of Expectation Maximization on {} Dataset'.format(dataset_name))
    plt.xlabel('Number of Components')
    plt.ylabel('Fowlkes-Mallows Score')
    plt.plot(kvals, train_scores, label='Training Score')
    plt.plot(kvals, test_scores, label='Test Score')
    plt.legend(loc='upper left')
    plt.show(block=False)

    plt.figure()
    plt.title('Performance Metrics of Expectation Maximization on {} Dataset'.format(dataset_name))
    plt.xlabel('K Value (Number of Clusters)')
    plt.ylabel('Score (Range 0.0 to 1.0)')
    plt.plot(kvals, train_homo, label='Training Homogeneity')
    plt.plot(kvals, test_homo, label='Test Homogeneity')
    plt.plot(kvals, train_completeness, label='Training Completeness')
    plt.plot(kvals, test_completeness, label='Test Completeness')
    plt.plot(kvals, train_v_score, label='Training V-Measure')
    plt.plot(kvals, test_v_score, label='Test V-Measure')
    plt.legend(loc='upper left')
    plt.show(block=False)
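For reference, a quick standalone check of what homogeneity_completeness_v_measure returns on toy labelings (the labels below are illustrative):

from sklearn.metrics import homogeneity_completeness_v_measure

# A clustering that matches the classes up to a permutation of ids is perfect.
print(homogeneity_completeness_v_measure([0, 0, 1, 1], [1, 1, 0, 0]))  # (1.0, 1.0, 1.0)
# Splitting a single class into two clusters keeps homogeneity at 1.0,
# but completeness (and with it the V-measure) drops to 0.0.
print(homogeneity_completeness_v_measure([0, 0, 0, 0], [0, 0, 1, 1]))  # (1.0, 0.0, 0.0)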
Example #3
def getClusteringEvalPlots(dataset):
    noOfClusters = range(2, 11, 1)

    for ds in dataset:
        sse = []
        sil = [[[], []]]
        scores = [[[], []], [[], []], [[], []], [[], []], [[], []]]
        for cluster in noOfClusters:
            kmLearner = Clustering.KM(n_clusters=cluster)
            kmLearner.getLearner().fit(ds.training_x)
            emLearner = Clustering.EM(n_components=cluster)
            emLearner.getLearner().fit(ds.training_x)
            clustringY_KM = kmLearner.getLearner().predict(ds.training_x)
            clustringY_EM = emLearner.getLearner().predict(ds.training_x)
            homogeneityKM, completenessKM, v_measureKM = homogeneity_completeness_v_measure(ds.training_y, clustringY_KM)
            AMISKM = adjusted_mutual_info_score(ds.training_y, clustringY_KM)
            ARSKM = adjusted_rand_score(ds.training_y, clustringY_KM)
            silhouetteKM = silhouette_score(ds.training_x, clustringY_KM)
            homogeneityEM, completenessEM, v_measureEM = homogeneity_completeness_v_measure(ds.training_y, clustringY_EM)
            AMISEM = adjusted_mutual_info_score(ds.training_y, clustringY_EM)
            ARSEM = adjusted_rand_score(ds.training_y, clustringY_EM)
            silhouetteEM = silhouette_score(ds.training_x, clustringY_EM)

            sse.append(kmLearner.getLearner().inertia_)
            sil[0][0].append(silhouetteKM)
            scores[0][0].append(v_measureKM)
            scores[1][0].append(AMISKM)
            scores[2][0].append(ARSKM)
            scores[3][0].append(homogeneityKM)

            sil[0][1].append(silhouetteEM)
            scores[0][1].append(v_measureEM)
            scores[1][1].append(AMISEM)
            scores[2][1].append(ARSEM)
            scores[3][1].append(homogeneityEM)

        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, sil[0][0], label='Silhouette Score, KM', marker='o')
        plt.plot(noOfClusters, sil[0][1], label='Silhouette Score, EM', marker='o', linestyle='--')
        plt.ylabel('Silhouette Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Silhouette Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Silhouette for ' + ds.name + '.png')
        plt.close()

        plt.style.use('seaborn-whitegrid')
        plt.plot(noOfClusters, scores[0][0], label='V Measure, KM', marker='o')
        plt.plot(noOfClusters, scores[1][0], label='Adj. Mutual Info, KM', marker='o')
        plt.plot(noOfClusters, scores[2][0], label='Adj. Rand. Score, KM', marker='o')
        plt.plot(noOfClusters, scores[0][1], label='V Measure, EM', marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[1][1], label='Adj. Mutual Info, EM', marker='o', linestyle='--')
        plt.plot(noOfClusters, scores[2][1], label='Adj. Rand. Score, EM', marker='o', linestyle='--')
        plt.ylabel('Score', fontsize=12)
        plt.xlabel('K', fontsize=12)
        plt.title('Score Plot for ' + ds.name, fontsize=12, y=1.03)
        plt.legend()
        plt.savefig('Figures/Clustering/Score for ' + ds.name + '.png')
        plt.close()
Example #4
    def test_benchmark_Chainlink(self):
        print('Clustering Chainlink.npz')
        npzfile = np.load('data/Chainlink.npz')
        X, y = npzfile['X'], npzfile['y']
        (N, _), k = X.shape, np.unique(y).shape[0]
        print(f'#Datapoints {N}')

        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        sklearn_mse = ((X - X_hat)**2).mean()
        score = metrics.homogeneity_completeness_v_measure(labels, y)
        print(f'MSE sklearn: {sklearn_mse}')
        print(f'MSE scores/measures: {score}')

        displacements = np.nan_to_num(X_incomplete)

        spans = np.nan_to_num(X_incomplete)
        spans[spans == 0] = 1
        spans[spans != 1] = 0

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
        config = ParameterConfig()

        ## data
        m = 60  # coreset size ~ reduction ratio
        tau = 1e-2

        config.a_b_approx_minimum_number_of_lines = 40  # constant 100, line 2, algo 2 BI-CRITERIA
        config.sample_size_for_a_b_approx = int(
            m * 1.05)  # |S| >= m, line 3 of algo 2
        # note: there'll be an O(|S|^2) cost while computing algo 1
        config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        config.number_of_remains_multiply_factor = int(
            math.log(N)
        ) // k  # this is `b` in algo 2, other paper, set arbitrarily here - how to calculate it?
        config.closest_to_median_rate = (1 - tau) / (
            2 * k)  # refer line 4, algo 1, other paper
        config.median_sample_size = int(
            N * 0.05)  # size of q_i, line 3, algo 2, other paper
        config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
        config.number_of_remains = 20

        SAMPLE_SIZE = 50

        ITER = 5
        klines_mse = np.zeros(ITER)
        scores = [[]] * ITER
        for i in range(ITER):
            print(f'Running KLines iter {i+1} of {ITER}')
            X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
            klines_mse[i] = ((X - X_klines)**2).mean()
            scores[i] = metrics.homogeneity_completeness_v_measure(
                kl_labels, y)

        print(f"Klines MSE: {klines_mse.mean()}")
        print(f"Scores: {np.array(scores).mean(axis=0)}")

        assert sklearn_mse / klines_mse.mean() > 0.8
Example #5
def test_avg_clustering_with_model_selection(db_dirs,
                                             method,
                                             val_dirs_count=2):
    bestStatistic, prevStatistic = 0, 0
    val_dirs_count = len(db_dirs)  #hack!!!
    if use_clustering == rankorder_clustering:
        bestThreshold = (0, 0)
        for distanceThreshold in np.linspace(1.02, 1.1, 9):
            prevStatistic = 0
            bestChanged = False
            for rankThreshold in range(12, 22, 2):
                currentStatistic = 0
                for i, db_dir in enumerate(db_dirs[:val_dirs_count]):
                    num_of_classes, num_of_clusters, y_true, y_pred = get_clustering_results(
                        db_dir, method, (distanceThreshold, rankThreshold))
                    #bcubed_precision,bcubed_recall,bcubed_fmeasure=BCubed_stat(y_true, y_pred)
                    #currentStatistic+=bcubed_fmeasure
                    homogeneity, completeness, v_measure = metrics.homogeneity_completeness_v_measure(
                        y_true, y_pred)
                    currentStatistic += v_measure
                    #print(num_of_classes)
                currentStatistic /= val_dirs_count
                print(distanceThreshold, rankThreshold, currentStatistic)
                if currentStatistic > bestStatistic:
                    bestStatistic = currentStatistic
                    bestThreshold = (distanceThreshold, rankThreshold)
                    bestChanged = True
                if currentStatistic <= prevStatistic:  #-0.01
                    break
                prevStatistic = currentStatistic
            if not bestChanged:
                break
    else:
        bestThreshold = 0
        for distanceThreshold in np.linspace(0.6, 1.3, 71):
            currentStatistic = 0
            for i, db_dir in enumerate(db_dirs[:val_dirs_count]):
                num_of_classes, num_of_clusters, y_true, y_pred = get_clustering_results(
                    db_dir, method, distanceThreshold)
                #bcubed_precision,bcubed_recall,bcubed_fmeasure=BCubed_stat(y_true, y_pred)
                #currentStatistic+=bcubed_fmeasure
                homogeneity, completeness, v_measure = metrics.homogeneity_completeness_v_measure(
                    y_true, y_pred)
                currentStatistic += v_measure
                #print(num_of_classes)
            currentStatistic /= val_dirs_count
            #print(distanceThreshold,currentStatistic)
            if currentStatistic > bestStatistic:
                bestStatistic = currentStatistic
                bestThreshold = distanceThreshold
            if currentStatistic < prevStatistic - 0.01:
                break
            prevStatistic = currentStatistic

    print('method:', method, 'bestParams:', bestThreshold, 'bestStatistic:',
          bestStatistic)
    #test_avg_clustering(db_dirs[val_dirs_count:],method,bestThreshold)
    test_avg_clustering(db_dirs, method, bestThreshold)  #hack!!!
Example #6
def test():
    from sklearn.metrics import homogeneity_completeness_v_measure
    from sklearn.cluster import KMeans
    from time import time

    # mat = np.random.random([500, 250])
    # mat[mat > 0.5] = 1
    # mat[mat <= 0.5] = 0

    n_clusters = 40
    mat = np.random.random([n_clusters, 250])
    mat[mat > 0.7] = 1
    mat[mat <= 0.7] = 0

    mats = []
    labels = []
    n_samples = 1500
    for i in xrange(mat.shape[0]):
        m = np.zeros([n_samples, mat.shape[1]])
        l = np.zeros(n_samples, dtype=np.int32) + i
        m[:, :] = mat[i]
        for j in xrange(n_samples):
            inds = np.random.permutation(np.arange(mat.shape[1]))[:50]
            m[j, inds] = 1 - m[j, inds]

        mats.append(m)
        labels.append(l)

    mat = np.concatenate(mats)
    labels = np.concatenate(labels)

    inds = np.random.permutation(np.arange(mat.shape[0]))
    mat = mat[inds]
    labels = labels[inds]

    st = time()
    modes, clusters = kmodes_fit(mat, n_clusters, 20, 3000)
    print "elapsed time:", time() - st

    # print modes.shape
    # print modes
    # print clusters.shape
    # print clusters

    st = time()
    clusters_km = KMeans(n_clusters, max_iter=20, n_init=1,
                         tol=0).fit_predict(mat)
    print "elapsed time:", time() - st

    print homogeneity_completeness_v_measure(labels, clusters)
    print homogeneity_completeness_v_measure(labels, clusters_km)
Example #8
    def tracking(self, d_start=gb.D_START_TRACKING, d_end=gb.D_END_TRACKING, path=""):
        print("\n --------- tracking ...")

        times_fsp, axes_fsp, labels_fsp = [], [], []
        times_ssp, axes_ssp, labels_ssp = [], [], []

        timedelta = datetime.timedelta(
            milliseconds=60 * 60 * 1000)  # read chunk by chunk (each chunk is of 'timedelta' milliseconds)
        date = d_start
        while date < d_end:
            if date + timedelta >= d_end: timedelta = d_end - date

            times, axes, labels = self.predict_fsp(d_start=date, d_end=date + timedelta)
            # self.plot_colored_signals(times, axes, labels, path, figname="_FSP.png")
            times_fsp += times
            axes_fsp += axes
            labels_fsp += labels

            times, axes, labels = self.predict_ssp(d_start=date, d_end=date + timedelta, update=True)
            # self.plot_colored_signals(times, axes, labels, path, figname="_SSP.png")
            times_ssp += times
            axes_ssp += axes
            labels_ssp += labels

            date += timedelta

        # ----------------------------
        if gb.ARTIFICIAL:
            times, values, true_labels = self.sigReaders[0].getSignal(start=d_start, end=d_end, dated=gb.DATED,
                                                                      get_modes=True)

            ari_fps = adjusted_rand_score(true_labels, labels_fsp)
            ari_sps = adjusted_rand_score(true_labels, labels_ssp)
            ami_fps = adjusted_mutual_info_score(true_labels, labels_fsp)
            ami_sps = adjusted_mutual_info_score(true_labels, labels_ssp)
            ho_fps, com_fps, vm_fps = homogeneity_completeness_v_measure(true_labels, labels_fsp)
            ho_sps, com_sps, vm_sps = homogeneity_completeness_v_measure(true_labels, labels_ssp)

            print("---------------------------------------------------")
            print("adjusted_rand_score \t (ari_fps, ari_sps)", (ari_fps, ari_sps))
            print("adjusted_mutual_info \t (ami_fps, ami_sps)", (ami_fps, ami_sps))
            print("homogeneity \t (ho_fps, ho_sps)", (ho_fps, ho_sps))
            print("completeness \t (com_fps, com_sps)", (com_fps, com_sps))
            print("v_measure \t (vm_fps, vm_sps)", (vm_fps, vm_sps))

            #return (ari_fps, ari_sps), (ami_fps, ami_sps), (ho_fps, ho_sps), (com_fps, com_sps), (vm_fps, vm_sps)
            return ((ari_fps, ari_sps), (ami_fps, ami_sps), (ho_fps, ho_sps), (com_fps, com_sps), (vm_fps, vm_sps)), (times_fsp,axes_fsp,labels_fsp,times_ssp,axes_ssp,labels_ssp)

        else:
            return 0., 0.
Example #9
def clustering_performance_evaluation(X, y_pred, y_true):
    """
    this function implement multiple evaluation metrics for clustering analysis.
    this method will be used in order to asses the quality of a clustering solution based on multiple criteria
    :param X: input matrix
    :param y_pred: predicted vector
    :param y_true: ground truth - if none - one do not have this knowledge
    :return: a dictionary with all measures
    """

    result = {}
    result['ARI'] = metrics.adjusted_rand_score(y_true, y_pred)
    result['AMI'] = metrics.adjusted_mutual_info_score(y_true, y_pred)
    result['NMI'] = metrics.normalized_mutual_info_score(y_true, y_pred)
    h, c, v = metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    result['Homo'] = h
    result['Comp'] = c
    result['V'] = v
    result['FM'] = metrics.fowlkes_mallows_score(y_true, y_pred)

    result['Sil'] = metrics.silhouette_score(X[['entropy', 'joint_entropy']],
                                             y_pred,
                                             metric='euclidean')

    return result
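A minimal invocation sketch for the helper above (the silhouette call requires columns named 'entropy' and 'joint_entropy', so the DataFrame below is shaped to match; all data is illustrative):

import numpy as np
import pandas as pd
from sklearn import metrics

X = pd.DataFrame({'entropy': np.random.rand(6),
                  'joint_entropy': np.random.rand(6)})
y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 0, 1, 1, 1, 1]
print(clustering_performance_evaluation(X, y_pred, y_true))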
Example #10
def K_Means_RFE(feature_set, label_set, depth_index, score_spread):
    """A recursive function to extract the best features and score vs # of features"""
    if len(feature_set[0]) == 1:
        return (-2, -2, -2), [], score_spread
    best_features = []
    max_sil_score = (-2, -2, -2)  # since each score's range is 0 to 1, this will be overridden
    top_features = []
    for i in range(len(feature_set[0])):
        sub_set = numpy.delete(feature_set, i, 1)
        kmeans = KMeans(n_clusters=8, n_init=10)
        kmeans = kmeans.fit(sub_set)
        sil_score = metrics.homogeneity_completeness_v_measure(
            label_set, kmeans.labels_)
        if sil_score[2] > max_sil_score[2]:
            max_sil_score = sil_score
            top_features = sub_set
    score_spread = numpy.insert(score_spread, 0, max_sil_score[2])

    print("Now entering depth: ", depth_index + 1)
    best_score, best_features, score_spread = K_Means_RFE(
        top_features, label_set, depth_index + 1, score_spread)
    if max_sil_score[2] > best_score[2]:
        best_score = max_sil_score
        best_features = top_features
    print("Now leaving depth: ", depth_index)
    return best_score, best_features, score_spread
Example #11
def clustering_metrics(labels_pred, labels_true = None, feature = None):
    '''
    Evaluation of clustering results.
    Metrics that require ground-truth labels:
        Adjusted Rand Index (ARI): argument order does not matter; ARI values lie in [-1, 1].
            Negative results are poor and indicate independently distributed labels;
            similar distributions give positive ARI, and 1 is the best result,
            meaning the two label distributions agree exactly.
        Adjusted Mutual Information (AMI): argument order does not matter; the best value is 1,
            and the worst results (uncorrelated with labels_true) are non-positive.
        Homogeneity, completeness, and their harmonic mean V-measure: range from 0 (worst) to 1 (best).
        Fowlkes-Mallows Index: the geometric mean of the pairwise precision and recall
            computed between the two labelings.

    Metrics that do not require ground-truth labels:
        Silhouette coefficient: values lie in [-1, 1]; the closer same-cluster samples are and
            the farther apart different-cluster samples are, the higher the score.
        Calinski-Harabasz Index: a larger value indicates a better clustering.
    '''

    if labels_true is not None:
        print u'Adjusted Rand Index (ARI): ', metrics.adjusted_rand_score(labels_true, labels_pred)
        print u'Adjusted Mutual Information (AMI): ', metrics.adjusted_mutual_info_score(labels_true, labels_pred)
        print u'Homogeneity, completeness, and their harmonic mean V-measure: ', metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
        print u'Fowlkes-Mallows Index (FMI): ', metrics.fowlkes_mallows_score(labels_true, labels_pred)

    if feature is not None:
        print u'Silhouette coefficient: ', metrics.silhouette_score(feature, labels_pred, metric='euclidean')
        print u'Calinski-Harabasz Index: ', metrics.calinski_harabaz_score(feature, labels_pred)
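A quick usage sketch (illustrative blob data; note that calinski_harabaz_score is the pre-0.20 sklearn spelling this example relies on):

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=3, random_state=0)
pred = KMeans(n_clusters=3).fit_predict(X)
clustering_metrics(pred, labels_true=y, feature=X)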
Example #12
    def score(self: 'Frame2D',
              score_frame: 'Frame2D',
              label_ix: int = -1,
              glcm_radius=None):
        """ Scores the current frame kmeans with a scoring image

        :param label_ix: The label index to score against score_frame
        :param score_frame: The score as Frame2D
        :param glcm_radius: The radius of GLCM used if applicable. This will crop the Frame2D automatically to fit.
        :return: A Dictionary of various scoring algorithm results,
            {'Custom', 'Homogeneity', 'Completeness', 'V Measure'}
        """
        # Convert grayscale to labels
        if glcm_radius is not None:
            score_frame = score_frame.crop_glcm(glcm_radius)
        true = self.labelize(score_frame.data[..., 0]).flatten()
        pred = self.data[..., label_ix].flatten()

        score = (self.scorer_pair(true, pred)['score'],
                 *homogeneity_completeness_v_measure(true, pred))
        return {
            "Custom": score[0],
            "Homogeneity": score[1],
            "Completeness": score[2],
            "V Measure": score[3]
        }
Example #13
def get_homogeneity_completeness_v_measure(labels_pred, labels_anno):
    """
    homogeneity_completeness_v_measure
    """
    h, c, v = metrics.homogeneity_completeness_v_measure(
        labels_anno, labels_pred)
    return h, c, v
Example #14
def results(X_test, y_test, clf=None):
    if clf is None:
        clf = cluster.KMeans(n_clusters=4, init='random').fit(X_test)

    preds = clf.predict(X_test)
    ans = pd.DataFrame({'label': y_test.values, 'kmean': preds})
    print(preds)
    print("y_test:   ", y_test)

    ans = ans.groupby(['kmean', 'label']).size()
    print(ans)

    correct = sum([
        anom if anom > norm else norm
        for anom, norm in zip(ans[::2], ans[1::2])
    ])

    print(correct)
    print(sum(ans))
    print("Total accuracy: {0:.1%}".format(correct / sum(ans)))

    y_test = y_test.tolist()

    for x in range(len(y_test)):
        if (y_test[x] == "attack"):
            y_test[x] = 1
        else:
            y_test[x] = 0

    print(homogeneity_completeness_v_measure(y_test, preds))
    print("ac ", metrics.accuracy_score(y_test, preds))
    print(confusion_matrix(y_test, preds))

    return clf
Example #15
def kmeans_clustering(X_train, y_train, X_test, y_test, genre_list):
    scaler = StandardScaler()
    scaler.fit(X_train, y_train)
    new_data = scaler.transform(X_train)
    kmeans = KMeans(init='k-means++', n_init=10, n_clusters=4, max_iter=300)
    rVal = kmeans.fit(X_train, y_train)
    kmeans_predictions = kmeans.predict(X_test)
    print("the randomized score is : ",
          metrics.adjusted_rand_score(y_test, kmeans_predictions))
    print("the normalized mutual info score is : ",
          metrics.normalized_mutual_info_score(y_test, kmeans_predictions))
    print("the mutual info score is : ",
          metrics.mutual_info_score(y_test, kmeans_predictions))
    print(
        "the homogenity, completeness and v measure score is : ",
        metrics.homogeneity_completeness_v_measure(y_test, kmeans_predictions))
    print("the fowlkes mallows score is : ",
          metrics.fowlkes_mallows_score(y_test, kmeans_predictions))
    labels = kmeans.labels_
    print(
        "the silhouette score is :",
        metrics.silhouette_score(X_test,
                                 kmeans_predictions,
                                 metric='euclidean'))
    print(kmeans_predictions)
    print(y_test)
    centers = rVal.cluster_centers_
    distances = pairwise_distances(new_data, centers, metric='euclidean')
    clusters = np.argmin(distances, axis=1)
    print(len(clusters))
    plotSamples = PCA(n_components=2).fit_transform(new_data)
    plotClusters(plotSamples, clusters, kmeans)
    joblib.dump(kmeans, 'saved_models/model_kmeans.pkl')
Example #16
def baseline_cluster(data, act_labels, k, output_folder, experiment_name):
    start_time = time.time()
    clusters = np.random.randint(0, k, size=data.shape[0])
    end_time = time.time()
    final_time = end_time - start_time
    h, c, v = homogeneity_completeness_v_measure(act_labels, clusters)
    return clusters, h, c, v, final_time
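As a sanity check on this random baseline: homogeneity, completeness, and V-measure for labels drawn independently of the ground truth should all sit near zero on reasonably sized data (these scores are not chance-adjusted, so they land close to but not exactly at zero; the data below is illustrative):

import numpy as np
from sklearn.metrics import homogeneity_completeness_v_measure

true_labels = np.random.randint(0, 4, size=10000)
random_clusters = np.random.randint(0, 4, size=10000)
print(homogeneity_completeness_v_measure(true_labels, random_clusters))  # all three near 0.0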
Example #17
def show_clustering_info(x: np.ndarray, y_true: np.ndarray, y_predicted: np.ndarray, folder: str = 'results',
                         filename: str = 'genes', extension: str = 'xlsx', sheet_name: str = 'results') -> None:
    """
    Shows information about the predicted data and saves them to an excel file.

    :param x: the x data.
    :param y_true: the known label values.
    :param y_predicted: the predicted label values.
    :param folder: the folder to save the results excel file.
    :param filename: the name of the excel file.
    :param extension: the file's extension.
    :param sheet_name: the excel's sheet name.
    """
    hcv = metrics.homogeneity_completeness_v_measure(y_true, y_predicted)

    # Create results dictionary.
    results = {'Adjusted Rand Index': [metrics.adjusted_rand_score(y_true, y_predicted)],
               'Homogeneity': [hcv[0]],
               'Completeness': [hcv[1]],
               'V Measure': [hcv[2]],
               'Silhouette Coefficient': [metrics.silhouette_score(x, y_predicted)]}

    # Log results.
    logger.log('Model\'s Results:')
    for key, values in results.items():
        for value in values:
            logger.log('{text}: {number:.{points}g}'.format(text=key, number=value, points=4))

    # Create excel if save is True.
    if SAVE_PRED_RESULTS:
        helpers.utils.create_excel(results, folder, filename, extension, sheet_name)
Example #18
def generate_eval_dict(gt, pred):
    # Put all the metrics values in a dictionary and return them
    eval_dict = {}
    # Compute all the traditional metrics
    eval_dict['homogeneity'], eval_dict['completeness'], eval_dict['v_measure'] = \
        homogeneity_completeness_v_measure(gt, pred)
    eval_dict['nmi'] = normalized_mutual_info_score(gt, pred)
    eval_dict['rand'] = adjusted_rand_score(gt, pred)
    eval_dict['munkres'] = munkres_score([gt], [pred])
    eval_dict['ari'] = adjusted_rand_score(gt, pred)

    # Compute all the new metrics
    eval_dict['rss_substring'] = repeated_structure_score(gt, pred, with_purity=True, substring=True)
    eval_dict['transs'] = transition_structure_score(gt, pred)
    eval_dict['transs_flip'] = transition_structure_score(pred, gt)
    eval_dict['lass'] = label_agnostic_segmentation_score(gt, pred)
    eval_dict['sss_combined'] = segment_structure_score_new(gt, pred)
    eval_dict['tss_combined'] = temporal_structure_score_new(gt, pred)
    eval_dict['tss_combined-10'] = temporal_structure_score_new(gt, pred, beta=10.)
    eval_dict['tss_combined-0,1'] = temporal_structure_score_new(gt, pred, beta=0.1)
    eval_dict['tss_combined-5'] = temporal_structure_score_new(gt, pred, beta=5.)
    eval_dict['tss_combined-0,5'] = temporal_structure_score_new(gt, pred, beta=0.5)
    eval_dict['tss_combined-2'] = temporal_structure_score_new(gt, pred, beta=2.)
    eval_dict['tss_combined-0,2'] = temporal_structure_score_new(gt, pred, beta=0.2)

    return eval_dict
Example #19
def spectral_cluster_evaluate(data, labels, n_cluster, affinity="rbf"):
    """

    :param data: similarity matrix or embedding vectors
    :param n_cluster:
    :param affinity: precomputed || rbf
    :return:
    """
    metric = "euclidean"
    if affinity == "precomputed":
        # Per the sklearn guidance: if data is a distance matrix rather than a
        # similarity matrix, it can be converted with the RBF transform below.
        distance_mat = data
        delta = math.sqrt(2)
        data = np.exp(-distance_mat**2 / (2. * delta**2))
        metric = affinity

    clustering = SpectralClustering(n_clusters=n_cluster,
                                    affinity=affinity,
                                    n_init=50,
                                    random_state=42)
    preds = clustering.fit_predict(data)
    h, c, v = metrics.homogeneity_completeness_v_measure(labels, preds)
    s1 = metrics.silhouette_score(data, labels, metric=metric)
    s2 = metrics.silhouette_score(data, preds, metric=metric)

    print(
        f"homogenetiy: {h}, completeness: {c}, v_measure: {v}, silhouette_score label: {s1}, silhouette_score pred: {s2}\n"
    )
Example #20
def bestClassify(X,Y):
	"Best classifier function"
	tfidf = True

	if tfidf:
		vec = TfidfVectorizer(preprocessor = identity,
							tokenizer = identity, )
	else:
		vec = CountVectorizer(preprocessor = identity,
							tokenizer = identity)

	km = KMeans(n_clusters=6, n_init=10, verbose=1)
	clusterer = Pipeline( [('vec', vec),
								('cls', km)] )

	clusterer.fit(X)
	prediction = clusterer.predict(X)

	checker = defaultdict(list)
	for pred,truth in zip(prediction,Y):
		checker[pred].append(truth)

	labeldict = {}
	for pred, label in checker.items():
		labeldict[pred] = Counter(label).most_common(1)[0][0]
		#print(pred, Counter(label).most_common(1)[0][0])

	prediction = [labeldict[p] for p in prediction]
	labels = list(labeldict.values())
	print(labels)
	print(confusion_matrix(Y, prediction, labels=labels))

	print("Rand-Index:", adjusted_rand_score(Y,prediction))
	print(homogeneity_completeness_v_measure(Y, prediction))
Example #21
def eval_2(labels_true, labels_pred, is_show=True):
    """
    Supervised evaluation.
    For every metric, values closer to 1 are better.
    :param labels_true:
    :param labels_pred:
    :param is_show:  whether to print the results
    :return:
    """
    if labels_true == []:
        info = f"cluster: img_sum:{len(labels_pred)}, id_sum:{len(set(labels_pred))}"
        return [], info
    nmi = 0  # metrics.normalized_mutual_info_score(labels_true, labels_pred)  # normalized mutual information
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)  # adjusted Rand index
    # homogeneity, completeness, v_measure
    homogeneity, completeness, v_measure_score = metrics.homogeneity_completeness_v_measure(
        labels_true, labels_pred)
    fmi = metrics.fowlkes_mallows_score(labels_true, labels_pred)  # geometric mean
    avg_pre, avg_rec, fscore = fowlkes_mallows_score(
        labels_true, labels_pred)  # harmonic mean *****
    k = 0.5
    fscore_2 = 2. * avg_pre * k * avg_rec / (avg_pre * k + avg_rec)

    s_1 = f"gt: img_sum:{len(labels_true)}, id_sum:{len(set(labels_true))}"
    s_2 = f"cluster: img_sum:{len(labels_pred)}, id_sum:{len(set(labels_pred))}"
    s_3 = "supervised: homogeneity, completeness, nmi, v_measure, ari:" + f"{r(homogeneity)}, {r(completeness)}, {r(nmi)}, {r(v_measure_score)}, {r(ari)}"
    s_4 = 'avg_pre, avg_rec, fscore, fmi:' + f"{r(avg_pre)}, {r(avg_rec)}, {r(fscore)}, {r(fmi)}"
    info = f"{s_1}\n{s_2}\n{s_3}\n{s_4}"
    if is_show:
        print(info)
    metric = [avg_pre, avg_rec, fscore, fmi]
    return metric, info
Example #22
 def computeHomogeneityCompleteness(self, labels_families,
                                    predicted_clusters):
     if labels_families is None:
         self.homogeneity, self.completeness, self.v_measure = 0, 0, 0
         return
     self.homogeneity, self.completeness, self.v_measure = \
             metrics.homogeneity_completeness_v_measure(labels_families, predicted_clusters)
Example #23
File: utils.py Project: AkChen/PBMvCL
def prin_clustering(test_rep, test_label, NUM_OF_CLASS):
    # clustering
    km = KMeans(n_clusters=NUM_OF_CLASS)
    #km.fit_transform(test_rep)
    cls_rs = km.fit_predict(test_rep)
    # ARI
    ari = metrics.adjusted_rand_score(test_label, cls_rs)
    # AMI
    ami = metrics.adjusted_mutual_info_score(test_label, cls_rs)
    # H,C,V
    H, C, V = metrics.homogeneity_completeness_v_measure(test_label, cls_rs)
    # FMI
    fmi = metrics.fowlkes_mallows_score(test_label, cls_rs)
    # s
    # s = metrics.silhouette_score(test_label, cls_rs)
    # DBI
    # dbi = metrics.davies_bouldin_score(test_label, cls_rs)
    # nmi
    nmi = metrics.normalized_mutual_info_score(test_label, cls_rs)

    d = dict()
    d['ari'] = ari
    d['ami'] = ami
    d['nmi'] = nmi
    d['fmi'] = fmi
    d['H'] = H
    d['C'] = C
    d['V'] = V

    print('ARI:%.4f,AMI:%.4f,HCV:%.4f %.4f %.4f FMI:%.4f NMI:%.4f' %
          (ari, ami, H, C, V, fmi, nmi))
    return d
Example #24
def cluster_evaluate(embeddings, labels, n_class, metric="euclidean"):
    """
        Unsupervised setting: We assess the ability of each method to embed close together nodes
        with the same ground-truth structural role. We use agglomerative clustering (with single linkage)
        to cluster embeddings learned by each method and evaluate the clustering quality via:
            (1) homogeneity, conditional entropy of ground-truth structural roles given the predicted clustering;
            (2) completeness, a measure of how many nodes with the same ground-truth structural role are assigned to the same cluster;
            (3) silhouette score, a measure of intra-cluster distance vs. inter-cluster distance.

        Supervised setting: We assess the performance of learned embeddings for node classifcation.
        Using 10-fold cross validation, we predict the structural role (label) of each node in the test set
        based on its 4-nearest neighbors in the training set as determined by the embedding space.
        The reported score is then the average accuracy and F1-score over 25 trials.
    """
    clusters = AgglomerativeClustering(n_clusters=n_class,
                                       linkage='single',
                                       affinity=metric).fit_predict(embeddings)
    h, c, v = metrics.homogeneity_completeness_v_measure(labels, clusters)
    s = metrics.silhouette_score(embeddings, clusters)
    acc = accuracy_score(labels, clusters)
    macro_f1 = f1_score(labels, clusters, average="macro")
    print("cluster:", clusters, "labels:", labels)
    print("accuracy: ", acc)
    print("macro_score: ", macro_f1)
    print("homogeneity: ", h)
    print("completeness: ", c)
    print("v-score: ", v)
    print("silhouette: ", s)

    return h, c, v, s
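Note that the accuracy_score and f1_score calls above compare raw cluster ids against ground-truth labels, which is only meaningful when the ids happen to coincide with the label encoding. A common workaround (the bestClassify example earlier does exactly this) is to first remap each cluster to the majority ground-truth label of its members; a minimal sketch:

from collections import Counter

def remap_clusters_to_majority_labels(labels, clusters):
    # map each cluster id to the most common ground-truth label among its members
    majority = {}
    for c in set(clusters):
        members = [l for l, k in zip(labels, clusters) if k == c]
        majority[c] = Counter(members).most_common(1)[0][0]
    return [majority[c] for c in clusters]

Passing the remapped sequence to accuracy_score then makes the result invariant to how the clusterer numbered its clusters.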
Example #25
File: utils.py Project: firebitsbr/UAHL
def results_evaluation_phase2(actual_labels, predicted_labels):
    start_time = datetime.now()

    print('  +| Extracting details of the resulting clusters...')
    actual_vs_predicted_labels_df = pd.DataFrame({'actual_labels': actual_labels, 'predicted_labels': predicted_labels})
    clusters_details = []    # The list includes details of each clusters (Number of items & items distribution)
    # Extracts clusters' details
    for c in [x for x in sorted(set(predicted_labels)) if x >= 0]:      # Evaluates each cluster (except outliers)
        details = dict(actual_vs_predicted_labels_df.query("predicted_labels == @c ")[
            'actual_labels'].value_counts())                                     # Counts actual labels inside the cluster

        details = {i: details[i] for i in sorted(details.keys())}                           # Sorts directory by keys
        clusters_details.append('       # Cluster [%s] contains %d items. Details:%s' % (str(
            c), list(predicted_labels).count(c), details))   # Adds the cluster's details to the list

    # Extracts outliers details
    if -1 in actual_vs_predicted_labels_df["predicted_labels"].tolist():
        outliers_details = dict(actual_vs_predicted_labels_df.query("predicted_labels == -1")['actual_labels'].value_counts())
        outliers_details = dict(sorted(outliers_details.items(), key=lambda x: x[0]))
    else:
        outliers_details = ""

    # Calculates homogeneity, completeness, Vmeasure, AR, and AMI scores
    warnings.filterwarnings('ignore')   # Suppresses warnings (outliers are filtered out below)
    print('  +| Calculating the clustering evaluation metrics...')
    MetricWithoutOtl = actual_vs_predicted_labels_df[actual_vs_predicted_labels_df['predicted_labels'] != -1]
    P2_hom_com_vmet = (homogeneity_completeness_v_measure(MetricWithoutOtl['actual_labels'], MetricWithoutOtl['predicted_labels']))
    P2_AR_Score = (adjusted_rand_score(MetricWithoutOtl['actual_labels'], MetricWithoutOtl['predicted_labels']))
    P2_AMI_Score = (adjusted_mutual_info_score(MetricWithoutOtl['actual_labels'], MetricWithoutOtl['predicted_labels']))

    # Prints the results' summary
    print('  *| Summary of Phase2 clustering results: ({} Clus. | {} Outl. | Homg.:{:.2%} | Comp.:{:.2%} | V-measure:{:.2%} | AR:{:.2%} | AMI:{:.2%})' .format(len(MetricWithoutOtl['predicted_labels'].unique()), list(predicted_labels).count(-1), P2_hom_com_vmet[0], P2_hom_com_vmet[1], P2_hom_com_vmet[2], P2_AR_Score, P2_AMI_Score))
    return clusters_details, outliers_details, P2_hom_com_vmet, P2_AR_Score, P2_AMI_Score
Example #26
def evaluate_clustering_performance(clusters, labels):
    set_of_dimensionality = set()
    for cluster in clusters:
        set_of_dimensionality.add(frozenset(cluster.dimensions))

    # Evaluating performance in all dimensionality
    for dim in set_of_dimensionality:
        print("\nEvaluating clusters in dimension: ", list(dim))
        # Finding clusters with same dimensions
        clusters_in_dim = []
        for c in clusters:
            if c.dimensions == dim:
                clusters_in_dim.append(c)
        clustering_labels = np.zeros(np.shape(labels))
        for i, c in enumerate(clusters_in_dim):
            clustering_labels[list(c.data_point_ids)] = i + 1

        print("Number of clusters: ", len(clusters_in_dim))
        print("Adjusted Rand index: ",
              metrics.adjusted_rand_score(labels, clustering_labels))
        print("Mutual Information: ",
              metrics.adjusted_mutual_info_score(labels, clustering_labels))

        print(
            "Homogeneity, completeness, V-measure: ",
            metrics.homogeneity_completeness_v_measure(labels,
                                                       clustering_labels))

        print("Fowlkes-Mallows: ",
              metrics.fowlkes_mallows_score(labels, clustering_labels))
Example #27
    def get_homogeneity_completeness_vmeasure(standard_file, prediction_file):
        """Get homogeneity, completeness, and V-measure score [Rosenberg2007]_.

        Parameters
        ----------
        standard_file   : str
            The ground truth or standard filename.
        prediction_file : str
            The analyzed or predicted filename.

        Returns
        -------
        homogeneity_completeness_vmeasure   : tuple
            Homogeneity, completeness, and V-measure score

        References
        ----------
        .. [Rosenberg2007] Andrew Rosenberg and Julia Hirschberg. V-Measure: A conditional entropy-based
                           external cluster evaluation measure. In Proceedings of the 2007 Joint Conference on
                           Empirical Methods in Natural Language Processing and Computational
                           Natural Language Learning, volume 7, pages 410-420, 2007.
        """
        standard_labels = ExternalEvaluation.get_evaluated(standard_file)
        prediction_labels = ExternalEvaluation.get_evaluated(prediction_file)
        homogeneity_completeness_vmeasure = \
            metrics.homogeneity_completeness_v_measure(standard_labels, prediction_labels)

        return homogeneity_completeness_vmeasure
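Per [Rosenberg2007], the V-measure (with the default beta of 1) is the harmonic mean of homogeneity and completeness, and sklearn's return value satisfies this identity directly; a toy check with illustrative labels:

from sklearn.metrics import homogeneity_completeness_v_measure

h, c, v = homogeneity_completeness_v_measure([0, 0, 1, 1], [0, 1, 1, 1])
assert abs(v - 2 * h * c / (h + c)) < 1e-12  # V = 2hc / (h + c)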
Example #28
def evaluate_recurrent_defects(ref_df: pd.DataFrame,
                               predictions,
                               remove_ata_zero_section=True):
    """
    Uses sklearn's Adjusted Rand Index, homogeneity, completeness and v-measure
    to evaluate the clustering predictions.

    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.completeness_score.html
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.v_measure_score.html

    :param ref_df: The reference dataframe.
    :param predictions: The predictions. Their format is an iterable collection of sets of defect labels belonging to
                        the same cluster, i.e.
                        [{'C-6414274-1', 'L-5245081-1'}, {'C-6414294-1', 'C-6414295-1', 'C-6414296-1'}, ...]
                        Clusters containing a single element are ignored during evaluation.
    :param remove_ata_zero_section: Remove from the reference all clusters for which the ATA section is 0 (recommended)
    :return: A dict with the following keys
        ari_score - Adjusted Rand Index, similarity score between -1.0 and 1.0. Random labelings have an ARI close to 0.
                                         1.0 stands for perfect match.
        homogeneity - A clustering result satisfies homogeneity if all of its predicted clusters contain only data
                      points that are clustered in the reference.
        completeness - A clustering result satisfies completeness if all the data points that are members of the
                       same reference cluster are found in the same predicted cluster.
        v_measure - harmonic mean of homogeneity and completeness
        pred_clusters - a list of predicted cluster labels, useful for debug
        ref_clusters - a list of reference cluster labels, useful for debug
        remove_ata_zero_section - copy of argument remove_ata_zero_section for this function
    """

    filled_df = ref_df.recurrent.fillna(
        NO_CLUSTER_LABEL
    )  # when there is no recurrent id, define as not clustered

    if remove_ata_zero_section:
        filled_df.where(ref_df.section == 0, NO_CLUSTER_LABEL, inplace=True)

    # remove clusters with a single member, which are not clusters at all
    duplicate_df = filled_df.duplicated(keep=False)
    filled_df.where(duplicate_df, NO_CLUSTER_LABEL, inplace=True)
    ref_clusters = filled_df

    # convert cluster assignments from the predictions in the same order as those from the ref
    pred_clusters = convert_cluster_labels_to_seq(ref_df, predictions)

    # evaluate
    homogeneity, completeness, v_measure_score = homogeneity_completeness_v_measure(
        ref_clusters, pred_clusters)
    ari_score = adjusted_rand_score(ref_clusters, pred_clusters)

    return {
        'ari_score': ari_score,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'v_measure': v_measure_score,
        'pred_clusters': pred_clusters,
        'ref_clusters': ref_clusters,
        'remove_ata_zero_section': remove_ata_zero_section
    }
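convert_cluster_labels_to_seq is a project helper that is not shown in this listing. Based on the docstring above, a plausible sketch follows; treating ref_df.index as the defect-label key is an assumption:

def convert_cluster_labels_to_seq(ref_df, predictions):
    # map every defect label that appears in a predicted cluster to that cluster's id
    label_to_cluster = {}
    for cluster_id, members in enumerate(predictions):
        if len(members) < 2:
            continue  # clusters containing a single element are ignored during evaluation
        for member in members:
            label_to_cluster[member] = cluster_id
    # emit cluster ids in the same row order as the reference dataframe
    return [label_to_cluster.get(label, NO_CLUSTER_LABEL) for label in ref_df.index]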
Example #30
 def test_homogeneity_completeness_vmeasure(self):
     labels_true, labels_pred = _linearize(self.labels_true, self.labels_pred)
     sk_homogeneity, sk_completeness, sk_vmeasure = skmetrics.homogeneity_completeness_v_measure(labels_true,
                                                                                                 labels_pred)
     homogeneity, completeness, vmeasure = self.metrics._homogeneity_completeness_vmeasure(1)
     self.assertEqual(homogeneity, sk_homogeneity)
     self.assertEqual(completeness, sk_completeness)
     self.assertEqual(sk_vmeasure, vmeasure)
Example #31
def computeExternalMetrics(labels, predLabels) -> ExternalClusterMetrics:
    """External metrics evaluate clustering performance against labeled
    data."""
    ami = metrics.adjusted_mutual_info_score(labels, predLabels)
    ars = metrics.adjusted_rand_score(labels, predLabels)
    fm = metrics.fowlkes_mallows_score(labels, predLabels)
    h, c, v = metrics.homogeneity_completeness_v_measure(labels, predLabels)
    return ExternalClusterMetrics(ami, ars, c, fm, h, v)
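A runnable sketch of calling this helper (ExternalClusterMetrics is the project's result container and is not shown here; the namedtuple below is an assumed stand-in whose fields follow the order of the return statement above):

from collections import namedtuple
from sklearn import metrics

ExternalClusterMetrics = namedtuple(
    'ExternalClusterMetrics', ['ami', 'ars', 'completeness', 'fm', 'homogeneity', 'vMeasure'])

print(computeExternalMetrics([0, 0, 1, 1], [0, 0, 1, 2]))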
Example #32
 def meanShift(self,X,axis2):
     ms=MeanShift(bandwidth=7)  # bandwidth
     ms.fit(X)
     pred_ms=ms.labels_
     axis2.scatter(X[:,0],X[:,1],c=pred_ms,cmap='prism')
     axis2.set_title('mean-shift',fontsize=40)
     print('mean-shift:',np.unique(ms.labels_))
     print('mean-shift:',homogeneity_completeness_v_measure(self.labels,pred_ms))
Example #33
def print_results(true_labels, pred_labels, num_clusters):
    (h, c, v) =  metrics.homogeneity_completeness_v_measure(true_labels, pred_labels)

    print "#Topics=%s (%s). v-measure: %0.3f. h**o: %0.3f. comp: %0.3f. MI: %0.3f. NMI: %0.3f. Acc: %0.3f" \
      % (num_clusters, len(pred_labels), v, h, c,
        metrics.mutual_info_score(true_labels, pred_labels),
        metrics.normalized_mutual_info_score(true_labels, pred_labels),
        metrics.accuracy_score(true_labels, pred_labels))
Example #34
def evaluate_clusters(true_labels, pred_labels, technique):
    homog, compl, v_measure = homogeneity_completeness_v_measure(
        true_labels, pred_labels)

    print('Clustering Evaluation of', technique)
    print('    Homogeneity: ', homog)
    print('    Completeness:', compl)
    print('    V-Measure:   ', v_measure)
Example #35
def main(argv):

	print("Usage: python LFDassignment3_KMextra_Group10.py <C50trainset> <C50testset>")

	print('Reading Data...')
	# define train and test set
	# shuffle data
	train = read_corpus(sys.argv[1])
	test = read_corpus(sys.argv[2])
	random.shuffle(train)
	random.shuffle(test)
	# only use a part of the test data
	split_point = int(0.10*len(test))
	test = test[:split_point]
	Xtrain = [i[0] for i in train]
	Xtest = [i[0] for i in test]
	Ytrain = [i[1] for i in train]
	Ytest = [i[1] for i in test]

	tfidf = True

	# TfidfVectorizer with additional features used for classification
	# I used only stopwords
	if tfidf:
		vec = TfidfVectorizer(ngram_range=(1,3), analyzer='word', preprocessor = preprocessor,
							  tokenizer = identity,
							  stop_words = 'english',
							  lowercase = True)
	else:
		vec = CountVectorizer(ngram_range=(1,3), analyzer='word', preprocessor = preprocessor,
							  tokenizer = identity)

	# define the Support Vector Model with a linear kernel
	'''clf = svm.SVC(kernel='linear', C=1)'''
	# define the Kmeans classifier with 50 clusters
	clf = KMeans(n_clusters=50, random_state=1000, n_init=1, verbose=0)
	classifier = Pipeline([('vec', vec), ('cls', clf)])

	print('Training Classifier...')
	# train the classifier with features and their labels
	classifier.fit(Xtrain,Ytrain)

	print('Predicting Test Values...')
	# predict values of Xtest
	Yguess = classifier.predict(Xtest)

	# calculate the accuracy scores for the SVM classifier
	'''accuracy = accuracy_score(Ytest, Yguess)
	print(('Accuracy:', accuracy))'''
	print('-'*40)

	# calculate accuracy for the Kmeans classifier
	try:
		print(classifier.labels_)
	except:
		pass
	print(adjusted_rand_score(Ytest,Yguess))
	print(homogeneity_completeness_v_measure(Ytest,Yguess))
Example #36
def k_means_results(name, A, B, x_label, y_label, colormap):
    X = A[0]
    y = A[1]
    X_test = B[0]
    y_test = B[1]
    h = .02
    n_clusters = 2
    k_means = KMeans(n_clusters=n_clusters)
    start = time.time()
    fit_results = k_means.fit(X)
    end = time.time()
    print 'Fit Time: ' + str(end - start)
    Y_kmeans = k_means.predict(X)
    ld.save_data('datasets/' + name.replace(' ', '_') + '_train.csv', [Y_kmeans,y])
    # print Y_kmeans
    figure_identifier = plt.figure()
    colors = ['yellow', 'cyan']
    if colormap:
        cmap_light = ListedColormap(['#FF3EFA', '#AAFFAA'])
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X[:, 0][Y_kmeans == i]
        py = X[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(fit_results.cluster_centers_[0, 0:1],fit_results.cluster_centers_[0, 1:2] , s=100, linewidths=4, c='orange', marker='x')
    plt.scatter(fit_results.cluster_centers_[1, 0:1],fit_results.cluster_centers_[1, 1:2] , s=100, linewidths=4, c='orange', marker='x')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + ' Train Results')
#    plt.show()
    plt.savefig('figures/' + name.replace(' ', '_') + '_Training_results.png')
    figure_identifier.clf()
    plt.close(figure_identifier)

    y_pred = Y_kmeans
    y_true = y

    print 'Accuracy Score'
    print metrics.accuracy_score(y_true, y_pred)
    print 'Classification Report'
    print metrics.classification_report(y_true, y_pred)
    print 'Confusion Matrix'
    print metrics.confusion_matrix(y_true, y_pred)
    print 'Completeness Score'
    print metrics.completeness_score(y_true,y_pred)
    print 'Homogeneity Score'
    print metrics.homogeneity_score(y_true,y_pred)
    print 'Homogeneity Completeness V Measured'
    print metrics.homogeneity_completeness_v_measure(y_true,y_pred)
    print 'Mutual Information Score'
    print metrics.mutual_info_score(y_true,y_pred)
    print 'Normalized Mutual Info Score'
    print metrics.normalized_mutual_info_score(y_true,y_pred)
    print 'Silhouette Score'
    print metrics.silhouette_score(X,fit_results.labels_)
    print 'Silhouette Samples'
    print metrics.silhouette_samples(X,fit_results.labels_)
    print 'V Measure Score'
    print metrics.v_measure_score(y_true,y_pred)

    print_confusion_matrix('Train', Y_kmeans, y)
    figure_identifier = plt.figure()
    Y_kmeans = k_means.predict(X_test)
    ld.save_data('datasets/' + name.replace(' ', '_') + '_test.csv', [Y_kmeans,y_test])
    colors = ['yellow', 'cyan']
    if colormap:
        cmap_light = ListedColormap(['#FF3EFA', '#AAFFAA'])
        x_min, x_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
        y_min, y_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X_test[:, 0][Y_kmeans == i]
        py = X_test[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(fit_results.cluster_centers_[0, 0:1],fit_results.cluster_centers_[0, 1:2] , s=100, linewidths=4, c='orange', marker='x')
    plt.scatter(fit_results.cluster_centers_[1, 0:1],fit_results.cluster_centers_[1, 1:2] , s=100, linewidths=4, c='orange', marker='x')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + ' Test Results')
#    plt.show()
    plt.savefig('figures/' + name.replace(' ', '_') + '_Test_results.png')
    print_confusion_matrix('Test', Y_kmeans, y_test)    
    figure_identifier.clf()
    plt.close(figure_identifier)
Example #37
def agglomerative_clustering(embedding_model_name, embedding_type, cluster_label_ground_truth_file,
                             cluster_n, method='ward', metric='euclidean', plot=False):
    embedding_file = 'data/{}/embeddings/{}_{}.npy'.format(dataset, embedding_model_name, embedding_type)
    embeddings = np.load(embedding_file)
    logger.info('Loaded embeddings from {}'.format(embedding_file))

    # Start clustering.
    logger.info('Start clustering ({}, {})...'.format(cluster_n, method))
    t0 = time()
    clustering = linkage(embeddings, method=method, metric=metric)
    logger.info('Clustering time: {}s'.format(time() - t0))

    embedding_labels = []
    embedding_label_file = 'data/{}/embeddings/{}_{}_labels.txt'.format(dataset, embedding_model_name, embedding_type)
    embedding_label_in = codecs.open(embedding_label_file)
    for row in embedding_label_in:
        if row:
            label = row.strip()
            if label:
                embedding_labels.append(label)
    embedding_label_in.close()

    cluster_label_prediction = fcluster(clustering, cluster_n, criterion='maxclust')    # 1-based index
    # logger.info('Cluster label prediction: {}'.format(cluster_label_prediction))
    clusters_agg = {}
    for i in xrange(len(cluster_label_prediction)):
        clusters_agg.setdefault(cluster_label_prediction[i] - 1, []).append(i)
    clustering_clusters_file = 'data/{}/clustering/{}_clusters.txt'.format(dataset, embedding_type)
    cluster_out = codecs.open(clustering_clusters_file, 'w')
    for i in xrange(len(clusters_agg)):
        cluster_out.write(u'{}\n'.format(','.join([embedding_labels[j] for j in clusters_agg[i]])))
    cluster_out.close()
    logger.info('Clustering labels saved at {}'.format(clustering_clusters_file))

    if cluster_label_ground_truth_file:
        # Read cluster label ground truth
        cluster_label_ground_truth = []
        with open(cluster_label_ground_truth_file) as f:
            for line in f:
                if line:
                    cluster_label_ground_truth.append(map(int, line.strip().split(',')))

        # Compute Adjusted Rand Index
        for i in xrange(len(cluster_label_ground_truth)):
            ari = metrics.adjusted_rand_score(cluster_label_ground_truth[i], cluster_label_prediction)
            logger.info('Adjusted Rand Index for cluster group {}: {}'.format(i, ari))
            ami = metrics.adjusted_mutual_info_score(cluster_label_ground_truth[i], cluster_label_prediction)
            logger.info('Adjusted Mutual Information Score for cluster group {}: {}'.format(i, ami))
            chv = metrics.homogeneity_completeness_v_measure(cluster_label_ground_truth[i], cluster_label_prediction)
            logger.info('Homogeneity, completeness, V-measure for cluster group {}: {}'.format(i, chv))

    # Compute Silhouette Coefficient
    t0 = time()
    sc_score = metrics.silhouette_score(embeddings, cluster_label_prediction, metric=metric)
    logger.info('Silhouette Coefficient: {}'.format(sc_score))
    logger.info('SC computation time: {}s'.format(time() - t0))

    if plot:
        plt.rc('lines', linewidth=2)
        plt.figure()
        plt.title('Relation Clustering', fontsize=28)
        plt.yticks([])
        dendrogram(
            clustering,
            leaf_rotation=90.,  # rotates the x axis labels
            leaf_font_size=14.,  # font size for the x axis labels
            labels=embedding_labels
        )
        plt.gcf().subplots_adjust(bottom=0.25)
        plt.show()
        # plt.savefig('data/{}/{}_clustering_dendrogram.png'.format(dataset, type), dpi=300)

    return sc_score
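A minimal invocation sketch for agglomerative_clustering above; the model name, embedding type, and ground-truth path are illustrative placeholders, and the module-level dataset/logger are assumed to be configured as in the surrounding code.

sc = agglomerative_clustering(
    embedding_model_name='word2vec',   # placeholder model name
    embedding_type='relation',         # placeholder embedding type
    cluster_label_ground_truth_file='data/example/relation_gt.txt',  # placeholder path
    cluster_n=10,
    method='ward',
    metric='euclidean',
    plot=False,
)
print('Silhouette Coefficient: {}'.format(sc))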
def print_metrics(true_clustering, cluster):
    #try some metrics from sklearn
    print "\n"
    print "adjusted rand score [-1.0 (bad) to 1.0 (good)]\n", metrics.adjusted_rand_score(true_clustering, cluster)
    print "mutual information based score [0.0 (bad) to 1.0 (good)]\n", metrics.adjusted_mutual_info_score(true_clustering, cluster)
    print "homogeneity, completeness, v measure [0.0 (bad) to 1.0 (good)]\n", metrics.homogeneity_completeness_v_measure(true_clustering, cluster)
def main(self, argv=sys.argv):
#######
    try:
        kernel = argv[3]
    except IndexError:  # no kernel argument supplied on the command line
        kernel = 'rbf'
        
    print('Training data loading....')
    data = arff.load(open(argv[1],'rb'))
    labeled_set = data['data']
    train_set = np.asarray([fila[0:len(fila)-1] for fila in labeled_set])
    train_set_labels = np.asarray([fila[-1] for fila in labeled_set])
    
    atts = data['attributes']
    atts_names = [fila[0] for fila in atts]
    att_values = [fila[1] for fila in atts]
    labels = np.array(att_values[len(att_values)-1])
    
    print 'TRAIN DATA SHAPE'
    print train_set.shape
    print 'Attributes NUM'
    print len(atts_names)
    print 'LABELS FOR CLASS'
    print labels
    
    print('Develop data loading....')
    datadev_set = arff.load(open(argv[2],'rb'))
    dev_labeled_set = datadev_set['data']
    dev_set = np.asarray([fila[0:len(fila)-1] for fila in dev_labeled_set])
    dev_set_labels = np.asarray([fila[-1] for fila in dev_labeled_set])
    
    dev_atts = datadev_set['attributes']  # read the attributes from the dev file, not the train file
    dev_atts_names = [fila[0] for fila in dev_atts]
    dev_att_values = [fila[1] for fila in dev_atts]
    dev_labels = np.array(dev_att_values[len(dev_att_values)-1])
    
    print 'DEV DATA SHAPE'
    print dev_set.shape
    print 'DEV Attributes NUM'
    print len(dev_atts_names)
    print 'LABELS FOR DEV CLASS'
    print dev_labels
    
####    
    print ('Preprocessing data...')
    # #    parse into dicts so the categorical attributes can be vectorized
    print ('Parsing categorical data...')

    dict_list = []
    N,F = train_set.shape
    for n in range(N):
        d = {}
        for f in range(F):
            feature = atts_names[f]
            d[feature] = train_set[n,f]
        dict_list.append(d)
        
    dev_dict_list = []
    N,F = dev_set.shape
    for n in range(N):
        d = {}
        for f in range(F):
            feature = dev_atts_names[f]
            d[feature] = dev_set[n,f]
        dev_dict_list.append(d)
    
    
    # Fit the vectorizer once on all training dicts; re-fitting per instance
    # (as before) produces inconsistent feature columns between rows.
    v = DictVectorizer(sparse=False, dtype=np.float16)
    v_train_set = v.fit_transform(dict_list)
    # Reuse the fitted feature mapping for the dev set.
    v_dev_set = v.transform(dev_dict_list)
            
    # # transform non-numerical labels to numerical
    # Fit the encoder once on all labels so train and dev share one mapping
    # (re-fitting on dev, as before, could assign different integers).
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate((train_set_labels, dev_set_labels)))
    train_numeric_labels = le.transform(train_set_labels)
    dev_numeric_labels = le.transform(dev_set_labels)
#########    
#     print ('Fitting the model')
#     #Fit the model
#     model = svm.SVC(kernel='rbf', gamma=2, C=1, degree=0).fit(v_train_set, train_numeric_labels, sample_weight=None)
#     print "Making predictions..."
#     expected=dev_numeric_labels
#     predicted = model.predict(v_dev_set)
#     print "Making Hold out evaluation with dev set..."
#     f1Aux = metrics.f1_score(expected, predicted, pos_label=0)
#     print ("New F1Score = %r" %f1Aux)
#     print(metrics.classification_report(expected, predicted, labels=None))
##########
    cBest = 0.
    gBest = 0.  
    dBest = 0.          
    print('Starting hyperparameter scan for the %s kernel....' % kernel)
    f1Aux=0.0
    f1Best=0.0
    if kernel=='rbf':
        maxD=3
    else:
        maxD=5
    for d in range(2,maxD):#2,5
        for i in range(-15,12):#-15,12
            c=2**i
            for j in range(-3,5):#-3,5
                g=2**j
                print("Hyperparameters: coef0 = %r gamma = %r degree = %d...." %(c,g,d))
                #   fit the model 
             
                model = svm.SVC(kernel=kernel, gamma=g, coef0=c, degree=d, class_weight='balanced').fit(v_train_set, train_numeric_labels, sample_weight=None)
                #     make predictions
                print "Making predictions..."
                expected=dev_numeric_labels
                predicted = model.predict(v_dev_set)
                print "Making Hold out evaluation with dev set..."
                f1Aux = metrics.f1_score(expected, predicted, pos_label=0)
                print ("New F1Score = %r" %f1Aux)
                if f1Aux>f1Best:
                    print ("Maximun F1Score = %r" %f1Aux)
                    f1Best=f1Aux    
                    print('Hyperparameters has been changed New degree = %d New coef0= %r New gamma = %r ' %(d,c,g))
                    cBest = c
                    gBest = g  
                    dBest = d          
    # summarize the fit of the model
    print('Optimized hyperparameters for the %s kernel: coef0 = %r, gamma = %r, degree = %d' % (kernel, cBest, gBest, dBest))
    #Concat train+dev
    X_all = np.vstack((v_train_set, v_dev_set))
    expected_all = np.concatenate((train_numeric_labels,dev_numeric_labels), axis=0)
    
    print('Starting dishonest evaluation (training set reused as the test set)')
    model = svm.SVC(kernel=kernel, gamma=gBest, coef0=cBest, degree=dBest).fit(v_train_set, train_numeric_labels, sample_weight=None)
    predicted = model.predict(v_train_set) 
    print(metrics.classification_report(train_numeric_labels, predicted, labels=None))
    
    print('Starting hold-out evaluation (fit on train, test on dev)')
    model = svm.SVC(kernel=kernel, gamma=gBest, coef0=cBest, degree=dBest).fit(v_train_set, train_numeric_labels, sample_weight=None)
    predicted = model.predict(v_dev_set)
    print(metrics.classification_report(dev_numeric_labels, predicted, labels=None))
    print(metrics.confusion_matrix(dev_numeric_labels, predicted))
    print
    print(metrics.f1_score(dev_numeric_labels, predicted, pos_label=0))        
    print(metrics.homogeneity_completeness_v_measure(dev_numeric_labels, predicted))
     
    print "Making 10-FCV with train+dev..."
    scores = cs.cross_val_score(model, X_all, expected_all, metrics.f1_score, cv=10, n_jobs=-1, verbose=True)
    print("F1score weighted: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    scores = cs.cross_val_score(model, X_all, expected_all, metrics.classification_report, cv=10, n_jobs=-1, verbose=True)
    for score in scores:
        print(score)
    
    if not os.path.isdir('Modelos'):
        os.mkdir('Modelos')
    date = time.strftime("%H%M%d%m%Y")
    jl.dump(model, 'Modelos/CSVM'+kernel+date+'.pkl') 
    
    return model
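The nested degree/coef0/gamma loops above can be written more compactly with scikit-learn's grid search; a sketch under the assumption that v_train_set and train_numeric_labels are prepared as above (note it cross-validates on the training set instead of using the fixed dev split).

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases
from sklearn import svm

param_grid = {
    'degree': list(range(2, 5)),
    'coef0': [2**i for i in range(-15, 12)],
    'gamma': [2**j for j in range(-3, 5)],
}
search = GridSearchCV(svm.SVC(kernel='poly', class_weight='balanced'),
                      param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
search.fit(v_train_set, train_numeric_labels)
print(search.best_params_, search.best_score_)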
Example #40
0
def compare_clusters(prog, argv):
    parser = argparse.ArgumentParser(prog=prog,
                                     description='Compare Equivalence Classes')
    parser.add_argument('classes', metavar='eqclass', type=str, nargs=2,
                        help='Ground Truth / Prediction')
    parser.add_argument('-ar', action='store_true', default=False,
                        help='Adjusted rand score')
    parser.add_argument('-mi', action='store_true', default=False,
                        help='Mutual info score')
    parser.add_argument('-ami', action='store_true', default=False,
                        help='Adjusted mutual info score')
    parser.add_argument('-nmi', action='store_true', default=False,
                        help='Normalised mutual info score')
    parser.add_argument('-pur', action='store_true', default=False,
                        help='Purity')
    parser.add_argument('-pr', action='store_true', default=False,
                        help='Classic Precision/Recall')
    parser.add_argument('-fm', action='store_true', default=False,
                        help='Fowlkes-Mallow score')
    parser.add_argument('-remove-identical', action='store_true', default=False,
                        help='Remove identical clusters before comparing')
    parser.add_argument('-f', type=str, help='Write results to filename')
    parser.add_argument('-test', action='store_true', default=False,
                        help='run tests')

    args = parser.parse_args(argv)

    # These are the converted example from:
    # https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html

    if args.test:
        ground_truth = Cluster()
        prediction = Cluster()

        prediction.insert(0, 1, 2, 3, 4, 5)
        prediction.insert(6, 7, 8, 9, 10, 11)
        prediction.insert(12, 13, 14, 15, 16)

        ground_truth.insert(0, 2, 3, 4, 5, 6, 12, 14)
        ground_truth.insert(1, 7, 8, 9, 11)
        ground_truth.insert(10, 13, 15, 16)
    else:
        ground_truth = Cluster.from_file(args.classes[0], must_exist=True)
        prediction = Cluster.from_file(args.classes[1], must_exist=True)

    # remove identical clusters, if desired
    if args.remove_identical:
        identical = list()
        for cluster in ground_truth:
            cand = prediction.get_cluster(list(cluster)[0])
            if not cand:
                continue
            elif cand == cluster:
                identical.append(cand)
        log.info('Removing %d identical clusters (%d elements)...' %
                 (len(identical), sum([len(x) for x in identical])))
        for cluster in identical:
            for element in cluster:
                ground_truth.remove_key(element)
                prediction.remove_key(element)
        ground_truth.optimize()
        prediction.optimize()

    if args.pr:
        prec_rec(ground_truth, prediction)

    # intermix all keys
    ground_truth_keys = ground_truth.get_keys()
    prediction_keys = prediction.get_keys()

    missing = ground_truth_keys - prediction_keys
    log.info('%d keys missing in prediction' % len(missing))
    for key in missing:
        prediction.insert_single(key)

    missing = prediction_keys - ground_truth_keys
    log.info('%d keys missing in ground truth' % len(missing))
    for key in missing:
        ground_truth.insert_single(key)

    gt = list(sorted(ground_truth.lookup.items()))
    t = list(sorted(prediction.lookup.items()))

    gt = [x[1] for x in gt]
    t = [x[1] for x in t]

    log.info('Number of equiv classes: %d' % len(ground_truth))

    homog, comp, vm = metrics.homogeneity_completeness_v_measure(gt, t)
    log.info("Homogeneity: %0.3f" % homog)
    log.info("Completeness: %0.3f" % comp)
    log.info("V-measure: %0.3f" % vm)

    if args.ar:
        ar = metrics.adjusted_rand_score(gt, t)
        log.info("Adjusted rand score: %0.3f" % ar)
    if args.mi:
        mi = metrics.mutual_info_score(gt, t)
        log.info("Mutual info score: %0.3f" % mi)
    if args.ami:
        ami = metrics.adjusted_mutual_info_score(gt, t)
        log.info("Adjusted mutual info score: %0.3f" % ami)
    if args.nmi:
        nmi = metrics.normalized_mutual_info_score(gt, t)
        log.info("Normalised mutual info score: %0.3f" % nmi)
    if args.pur:
        elements = len(gt)
        hits = 0
        for cluster in ground_truth:
            best_overlap = 0
            for element in cluster:
                overlap = len(cluster & prediction[element])
                if overlap > best_overlap:
                    best_overlap = overlap
            hits += best_overlap
        purity = hits / float(elements)  # float() guards against integer division on Python 2
        log.info('Purity: %0.3f' % purity)
    if args.fm:
        fm = metrics.fowlkes_mallows_score(gt, t)
        log.info("Fowlkes-Mallows score: %0.3f" % fm)

    if args.f:
        with open(args.f, 'w') as f:
            f.write("h**o: %0.3f\n" % h**o)
            f.write("comp: %0.3f\n" % comp)
            f.write("vm: %0.3f\n" % vm)
            if args.ar:
                f.write("ar: %0.3f\n" % ar)
            if args.mi:
                f.write("mi: %0.3f\n" % mi)
            if args.nmi:
                f.write("nmi: %0.3f\n" % nmi)
            if args.ami:
                f.write("ami: %0.3f\n" % ami)
            if args.pur:
                f.write("pur: %0.3f\n" % purity)
            if args.fm:
                f.write("fm: %0.3f\n" % fm)

    return 0
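The -pur branch above computes purity by hand; the same quantity falls out of the contingency matrix in a few lines. A compact sketch, not part of the original script:

from sklearn import metrics

def purity_score(labels_true, labels_pred):
    # Credit each predicted cluster with its most common ground-truth class;
    # purity is the fraction of samples covered by those majority classes.
    C = metrics.cluster.contingency_matrix(labels_true, labels_pred)
    return C.max(axis=0).sum() / float(C.sum())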
Example #41
0
# Opening line reconstructed: the listing truncated the start of this call;
# TfidfVectorizer is assumed from the fit_transform usage below.
vectorizer = TfidfVectorizer(
	max_features=n_feathers,
	stop_words=stop_word,
	use_idf=True
)


# vectorizer = Pipeline((
# 	('hasher', hasher),
# 	('tf_idf', TfidfTransformer())
# ))

X = vectorizer.fit_transform(twenty_train.data)
labels = twenty_train.target
true_k = np.unique(labels).shape[0]


km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=False, random_state=RandomState(42))
X_kmean = km.fit(X)
# print km.cluster_centers_
print '##########################'


ari_score = metrics.adjusted_rand_score(twenty_train.target, km.labels_)
all_three_score = metrics.homogeneity_completeness_v_measure(twenty_train.target, km.labels_)
print ari_score
print all_three_score
print metrics.silhouette_score(X, km.labels_, metric='euclidean')

# 0.172990728537
# (0.23396974800824874, 0.34894426413758112, 0.28011816442240145)
# 0.00810690704347
def homogeneity_completeness(pair):
    # Tuple parameters were removed in Python 3; unpack inside instead.
    x, y = pair
    return list(metrics.homogeneity_completeness_v_measure(x, y))
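The helper above takes a single (truth, prediction) pair, which makes it easy to map over many pairs at once; a toy call (illustrative):

pairs = [([0, 0, 1, 1], [1, 1, 0, 0]), ([0, 1, 0, 1], [0, 0, 1, 1])]
print map(homogeneity_completeness, pairs)  # Python 2; use list(map(...)) on Python 3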
pp2 = ax.scatter(c2[:, 0], c2[:, 1], cmap='prism', s=50, color='g')
ax.legend((pp1,pp2),('class 1', 'class2'),fontsize=35)
fig.savefig('classes.png')


#start figure
fig.clf()#reset plt
fig, ((axis1, axis2), (axis3, axis4)) = plt.subplots(2, 2, sharex='col', sharey='row')

#k-means
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
pred_kmeans = kmeans.labels_
#axis1 = fig.add_subplot(211)
print 'kmeans:',np.unique(kmeans.labels_)
print 'kmeans:',homogeneity_completeness_v_measure(labels,pred_kmeans)
plt.scatter(X[:,0], X[:,1], c=kmeans.labels_, cmap='prism')  # plot points with cluster dependent colors
axis1.scatter(X[:,0], X[:,1], c=kmeans.labels_, cmap='prism')
#axis1.set_xlabel('x',fontsize=40)
axis1.set_ylabel('y',fontsize=40)
axis1.set_title('k-means',fontsize=20)
#plt.show()


#mean-shift
ms = MeanShift(bandwidth=7)
ms.fit(X)
pred_ms = ms.labels_
axis2.scatter(X[:,0], X[:,1], c=pred_ms, cmap='prism')
axis2.set_title('mean-shift',fontsize=20)
Example #44
0
File: test.py  Project: jy00295005/ML
X_tfidf = transformer.fit_transform(X_counts)
# print vectorizer.get_feature_names()
print len(vectorizer.get_feature_names())
km = KMeans(n_clusters=3, init='k-means++', max_iter=800, n_init=200)
X_kmean = km.fit(X_tfidf)
X_kmean_r = X_kmean.transform(X_tfidf)

# print(X_kmean_r)
# print X_kmean.labels_
# print km.labels_
# print np.asarray(myLabel, dtype=np.int)

ari_score = metrics.adjusted_rand_score(myLabel, km.labels_)
all_three_score = metrics.homogeneity_completeness_v_measure(myLabel, km.labels_)

print "ARI (similarity between the true and predicted assignments): %s" % ari_score
print "Adjusted Mutual Information (agreement between labels_true and labels_pred): %s" % metrics.adjusted_mutual_info_score(myLabel, km.labels_)
# print ''
print "Homogeneity (each cluster contains members of a single class only): %s" % all_three_score[0]
print "Completeness (all members of a class are assigned to the same cluster): %s" % all_three_score[1]
print "V-measure (NMI normalized by the sum of the label entropies): %s" % all_three_score[2]
print "Silhouette Coefficient: %s" % metrics.silhouette_score(X_tfidf, np.asarray(myLabel, dtype=np.int), metric='euclidean')

fig, ax = pl.subplots()
for c, i in zip("rgb", [0, 1, 2]):
	pl.scatter(X_kmean_r[np.asarray(X_kmean.labels_) == i, 0], X_kmean_r[np.asarray(X_kmean.labels_) == i, 1], c=c,label='Dimension 1 vs Dimension2')
	ax.set_xlabel('Dimension 0 ')
	ax.set_ylabel('Dimension 1 ')
	ax.set_title('term words scatter plot of 3 cluster Dimension 0 vs Dimension 1')
def compareClustering(groundTruth, modelCluster):
	print "adjusted Rand score (different from assigning random classes?) = ", metrics.adjusted_rand_score(groundTruth, modelCluster)
	print "adjusted mutual information score (tends to increase with the number of clusters) = ", metrics.adjusted_mutual_info_score(groundTruth, modelCluster)
	print "homogeneity, completeness, v-measure scores = ", metrics.homogeneity_completeness_v_measure(groundTruth, modelCluster)
Example #46
0
        c_inds = np.where(clusters[k] == i)

        cluster_treatment_label = (
            1 if df["treatment_label"].values[c_inds].sum() / len(df["treatment_label"].values[c_inds]) >= 0.5 else 0
        )
        pred_treatment_labels[k][c_inds] = cluster_treatment_label

        cluster_infection_label = (
            1
            if df["case_control_label"].values[c_inds].sum() / len(df["case_control_label"].values[c_inds]) >= 0.5
            else 0
        )
        pred_infection_labels[k][c_inds] = cluster_infection_label

    cluster_treatment_stats[k] = metrics.homogeneity_completeness_v_measure(
        df["treatment_label"].values, pred_treatment_labels[k]
    )
    cluster_infection_stats[k] = metrics.homogeneity_completeness_v_measure(
        df["case_control_label"].values, pred_infection_labels[k]
    )


# compute one way anova over clustering solutions:
for p in ["Vic_HA", "Vic_NA"]:
    for assay in Vic_assays:
        group_samples = {}
        for i in np.arange(1, num_clusters + 1):
            group_samples[i] = np.asarray(df[assay].loc[clusters[p] == i])
            group_samples[i] = group_samples[i][~np.isnan(group_samples[i])]
        (F, p_anova) = scipy.stats.f_oneway(*group_samples.values())
        print(p, assay, F, p_anova)
Example #47
0
classifier = Pipeline([('vec',vec),('cls', km)])
classifier.fit(X)
Yguess = classifier.predict(X)


labelDict = {}
clusterCombos = defaultdict(list)
for pred, gold in zip(Yguess, Y):
	clusterCombos[pred].append(gold)
for pred, gold in clusterCombos.items():
	labelDict[pred]=Counter(gold).most_common(1)[0][0]
predList = [labelDict[label] for label in Yguess]

print("Rand index: {}".format(adjusted_rand_score(Y,Yguess)))
print("V-measure: {}".format(v_measure_score(Y,Yguess)))
print("All three: {}".format(homogeneity_completeness_v_measure(Y,Yguess)))

cm=confusion_matrix(Y, predList, labels=list(set(Y)))
print(cm)

plt.figure()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix of binary label K-Means classification')
plt.colorbar()
tick_marks = numpy.arange(len(list(set(Y))))
plt.xticks(tick_marks, list(set(Y)), rotation=45)
plt.yticks(tick_marks, list(set(Y)))
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
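The labelDict block above maps each predicted cluster id to the most common gold label among its members; the same idea as a reusable helper, a minimal sketch (the function name is illustrative):

from collections import Counter, defaultdict

def majority_label_map(pred_clusters, gold_labels):
    # Collect the gold labels falling into each predicted cluster and keep
    # the most frequent one as that cluster's label.
    combos = defaultdict(list)
    for pred, gold in zip(pred_clusters, gold_labels):
        combos[pred].append(gold)
    return {pred: Counter(golds).most_common(1)[0][0] for pred, golds in combos.items()}

# e.g. mapping = majority_label_map(Yguess, Y); predList = [mapping[c] for c in Yguess]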
	for feature in actor['featureVectors']:
		featureList.append(feature)

k_means.fit(featureList)

groundTruthCluster = []
i = 0
for actor in featureListFromFile['features']:
	for feature in actor['featureVectors']:
		groundTruthCluster.append(i)
	i += 1

modelCluster = []

for actor in featureListFromFile['features']:
	for feature in actor['featureVectors']:
		cluster = k_means.predict([feature])  # predict expects a 2-D array
		for c in cluster:
			modelCluster.append(c)

print "adjusted Rand index (different from assigning random classes?) = ", metrics.adjusted_rand_score(groundTruthCluster, modelCluster)
print "adjusted mutual information score (tends to increase with the number of clusters) = ", metrics.adjusted_mutual_info_score(groundTruthCluster, modelCluster)
print "homogeneity, completeness, v-measure scores = ", metrics.homogeneity_completeness_v_measure(groundTruthCluster, modelCluster)

Example #49
0
def k_means_results(name, A, B, x_label, y_label, colormap):
    X = A[0]
    y = A[1]
    X_test = B[0]
    y_test = B[1]
    h = 0.02
    n_clusters = 2
    k_means = KMeans(n_clusters=n_clusters)
    start = time.time()
    fit_results = k_means.fit(X)
    end = time.time()
    print "Fit Time: " + str(end - start)
    Y_kmeans = k_means.predict(X)

    y_pred = Y_kmeans
    y_true = y

    print "Train Accuracy Score Default"
    print metrics.accuracy_score(y_true, y_pred)
    y_pred = map(flip, Y_kmeans)
    print "Train Accuracy Score Flip Labels"
    print metrics.accuracy_score(y_true, y_pred)
    print "Classification Report"
    print metrics.classification_report(y_true, y_pred)
    print "Confusion Matrix"
    print metrics.confusion_matrix(y_true, y_pred)
    print "Completeness Score"
    print metrics.completeness_score(y_true, y_pred)
    print "Homogeneity Score"
    print metrics.homogeneity_score(y_true, y_pred)
    print "Homogeneity Completeness V Measured"
    print metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    print "Mutual Information Score"
    print metrics.mutual_info_score(y_true, y_pred)
    print "Normalized Mutual Info Score"
    print metrics.normalized_mutual_info_score(y_true, y_pred)
    print "Silhouette Score"
    print metrics.silhouette_score(X, fit_results.labels_)
    print "Silhouette Samples"
    print metrics.silhouette_samples(X, fit_results.labels_)
    print "V Measure Score"
    print metrics.v_measure_score(y_true, y_pred)

    figure_identifier = plt.figure()
    colors = ["yellow", "cyan"]
    if colormap:
        cmap_light = ListedColormap(["#FF3EFA", "#AAFFAA"])
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X[:, 0][Y_kmeans == i]
        py = X[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(
        fit_results.cluster_centers_[0, 0:1],
        fit_results.cluster_centers_[0, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.scatter(
        fit_results.cluster_centers_[1, 0:1],
        fit_results.cluster_centers_[1, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + " Train Results")
    #    plt.show()
    plt.savefig("figures/" + name.replace(" ", "_") + "_Training_results.png")
    figure_identifier.clf()
    plt.close(figure_identifier)

    print_confusion_matrix("Train", Y_kmeans, y)
    figure_identifier = plt.figure()
    Y_kmeans = k_means.predict(X_test)
    y_pred = Y_kmeans
    y_true = y_test

    print "Test Accuracy Score Default"
    print metrics.accuracy_score(y_true, y_pred)
    y_pred = map(flip, Y_kmeans)
    print "Test Accuracy Score Flip Labels"
    print metrics.accuracy_score(y_true, y_pred)
    colors = ["yellow", "cyan"]
    if colormap:
        cmap_light = ListedColormap(["#FF3EFA", "#AAFFAA"])
        x_min, x_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
        y_min, y_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i in xrange(len(colors)):
        px = X_test[:, 0][Y_kmeans == i]
        py = X_test[:, 1][Y_kmeans == i]
        plt.scatter(px, py, c=colors[i])
    plt.scatter(
        fit_results.cluster_centers_[0, 0:1],
        fit_results.cluster_centers_[0, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.scatter(
        fit_results.cluster_centers_[1, 0:1],
        fit_results.cluster_centers_[1, 1:2],
        s=100,
        linewidths=4,
        c="orange",
        marker="x",
    )
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(name + " Test Results")
    #    plt.show()
    plt.savefig("figures/" + name.replace(" ", "_") + "_Test_results.png")
    print_confusion_matrix("Test", Y_kmeans, y_test)
    figure_identifier.clf()
    plt.close(figure_identifier)
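The flip trick above only works for two clusters; in general, cluster ids can be aligned to classes with the Hungarian algorithm before computing accuracy. A sketch (not part of the original example), assuming scipy >= 0.17 for linear_sum_assignment:

from scipy.optimize import linear_sum_assignment
from sklearn import metrics

def aligned_accuracy(y_true, y_pred):
    # Contingency matrix: rows are true classes, columns are predicted clusters.
    C = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Find the cluster-to-class assignment that maximizes matched samples.
    row_ind, col_ind = linear_sum_assignment(-C)
    return C[row_ind, col_ind].sum() / float(C.sum())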
Example #50
0
def test_homogeneity_completeness_v_measure(self):
    result = self.df.metrics.homogeneity_completeness_v_measure()
    expected = metrics.homogeneity_completeness_v_measure(self.target, self.pred)
    self.assertEqual(result, expected)
post_inds = time_dict['Post']
p_labels = np.unique(arr_df[post_inds].group_label.values)
for k in ind_dict.keys():
    # use Andrew's package which allows clustering using Spearman distances (sch.linkage, and pdist do not support this for some reason, unlike Matlab)
    (dMat[k], Z_struct[k], dend[k]) = hcp.computeHCluster(arr_df[post_inds][ind_dict[k]], method='complete', metric='spearman')
    clusters[k] = sch.fcluster(Z_struct[k], t=num_clusters, criterion='maxclust')

    # compute cluster homogeneity and completness (purity and accuracy) for treatment label and for infection status:
    pred_treatment_labels[k] = np.zeros(shape=(arr_df[post_inds].shape[0]))
    for i in np.arange(1, num_clusters+1):
        c_inds = np.where(clusters[k] == i)
        val, ind = scipy.stats.mode(arr_df[post_inds]['group_label'].values[c_inds])
        pred_treatment_labels[k][c_inds] = val[0]

    cluster_treatment_stats[k] = metrics.homogeneity_completeness_v_measure(arr_df[post_inds]['group_label'].values, pred_treatment_labels[k])

# compute pairwise statistics of clusters using alternate assays as values:
prot_stats = {}
for p in ['SHA_ha', 'SHA_na']:
    p_values = {assay: np.zeros(shape=(num_clusters, num_clusters)) for assay in assays}
    q_values = {assay: np.zeros(shape=(num_clusters, num_clusters)) for assay in assays}
    stats_df = pd.DataFrame()
  
    for assay in assays:
        res = []
        c_inds = []
        for i in np.arange(num_clusters):
            for j in np.arange(i+1, num_clusters):
                res.append(scipy.stats.ranksums(arr_df[assay].loc[clusters[p] == i+1], arr_df[assay].loc[clusters[p] == j+1]))
                c_inds.append((i+1, j+1))
Example #52
0
File: kmeans.py  Project: butara/PPRM
import sys

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, homogeneity_completeness_v_measure
import pylab

if len(sys.argv) < 3:
    sys.exit('Usage: python kmeans.py dataset k')

## Data preprocessing
data = parse_tab(sys.argv[1])
k = int(sys.argv[2])
classes = [example[-1] for example in data]
examples = data_to_na(data)

## Clustering
kmeans = KMeans(n_clusters=k, random_state=0)  # 'k' was renamed to 'n_clusters' in modern scikit-learn
kmeans.fit(examples)
codebook = kmeans.cluster_centers_
labels = kmeans.predict(examples)

## Performance evaluation
ari = adjusted_rand_score(classes, labels)
# Note: homogeneity/completeness are asymmetric -- ground truth must be the first argument.
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(classes, labels)
print('ARI: {0}'.format(ari))
print('Homogeneity: {0}'.format(homogeneity))
print('Completeness: {0}'.format(completeness))
print('V-measure: {0}'.format(v_measure))

pylab.figure(1)
pylab.scatter(examples.T[0], examples.T[1], c=labels)
pylab.show()
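Homogeneity and completeness swap roles when the arguments are swapped, so the (ground truth, prediction) order matters; a toy illustration:

from sklearn.metrics import homogeneity_completeness_v_measure

truth = [0, 0, 1, 1]
pred = [0, 1, 2, 3]  # every point in its own cluster
print(homogeneity_completeness_v_measure(truth, pred))  # (1.0, 0.5, ~0.67): homogeneous but incomplete
print(homogeneity_completeness_v_measure(pred, truth))  # (0.5, 1.0, ~0.67): the two scores swap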
# In[27]:


km2 = KMeans(n_clusters=2, random_state=42).fit(X)
km2_labels = km2.labels_

km5 = KMeans(n_clusters=5, random_state=42).fit(X)
km5_labels = km5.labels_


# ## Homogeneity, Completeness and V-measure

# In[28]:


km2_hcv = np.round(metrics.homogeneity_completeness_v_measure(y, km2_labels), 3)
km5_hcv = np.round(metrics.homogeneity_completeness_v_measure(y, km5_labels), 3)

print('Homogeneity, Completeness, V-measure metrics for num clusters=2: ', km2_hcv)
print('Homogeneity, Completeness, V-measure metrics for num clusters=5: ', km5_hcv)


# ## Silhouette Coefficient

# In[29]:


from sklearn import metrics

km2_silc = metrics.silhouette_score(X, km2_labels, metric='euclidean')
km5_silc = metrics.silhouette_score(X, km5_labels, metric='euclidean')
def evaluateClusteringModel(self, predTarget):
    from sklearn.metrics import homogeneity_completeness_v_measure
    print('Clustering report--homogeneity, completeness, v-measure')
    print(homogeneity_completeness_v_measure(self.testTarget, predTarget))
Example #55
0
y_pred = k_means.predict(X) 

accuracy_score = []
# http://scikit-learn.org/stable/modules/classes.html
print 'Accuracy Score'
accuracy_score.append(metrics.accuracy_score(y_true, y_pred))
print accuracy_score[-1]
print 'Classification Report'
print metrics.classification_report(y_true, y_pred)
print 'Confusion Matrix'
print metrics.confusion_matrix(y_true, y_pred)
print 'Completeness Score'
print metrics.completeness_score(y_true,y_pred)
print 'Homogeneity Score'
print metrics.homogeneity_score(y_true,y_pred)
print 'Homogeneity Completeness V Measure'
print metrics.homogeneity_completeness_v_measure(y_true,y_pred)
print 'Mutual Information Score'
print metrics.mutual_info_score(y_true,y_pred)
print 'Normalized Mutual Info Score'
print metrics.normalized_mutual_info_score(y_true,y_pred)
print 'Silhouette Score'
print metrics.silhouette_score(X,result.labels_)
print 'Silhouette Samples'
print metrics.silhouette_samples(X,result.labels_)
print 'V Measure Score'
print metrics.v_measure_score(y_true,y_pred)


stdsc = StandardScaler()
X_scaled = stdsc.fit_transform(X)
k_means = KMeans(n_clusters=2)