def Porownaj_algorytmy(data, klasy, labels, method, baza):
    """Computes the AM, AR and FM indices for all algorithms except the one I wrote."""
    wektor = []
    test = [0] * len(method)
    i = 0
    # scipy linkage-based algorithms
    for name in method:
        Z = linkage(data, name)
        test[i] = cluster.hierarchy.cut_tree(Z, klasy)
        test[i] = [y for x in test[i] for y in x]
        wektor.append([fowlkes_mallows_score(labels, test[i]),
                       adjusted_mutual_info_score(labels, test[i]),
                       adjusted_rand_score(labels, test[i]),
                       baza])
        i += 1
    # genieclust algorithm
    wynikMG = genieclust.genie.Genie(n_clusters=klasy).fit_predict(data)
    wektor.append([fowlkes_mallows_score(labels, wynikMG),
                   adjusted_mutual_info_score(labels, wynikMG),
                   adjusted_rand_score(labels, wynikMG),
                   baza])
    # MeanShift
    wynikCL = MeanShift(bandwidth=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikCL),
                   adjusted_mutual_info_score(labels, wynikCL),
                   adjusted_rand_score(labels, wynikCL),
                   baza])
    # AgglomerativeClustering
    wynikFA = AgglomerativeClustering(n_clusters=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikFA),
                   adjusted_mutual_info_score(labels, wynikFA),
                   adjusted_rand_score(labels, wynikFA),
                   baza])
    # KMeans
    wynikKM = KMeans(n_clusters=klasy, random_state=123).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikKM),
                   adjusted_mutual_info_score(labels, wynikKM),
                   adjusted_rand_score(labels, wynikKM),
                   baza])
    # Row labels follow the append order above.
    index = ["single", 'complete', 'average', 'weighted', 'centroid', 'median', 'ward',
             "genieclust", "MeanShift", "AgglomerativeClustering", "KMeans"]
    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
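# A minimal usage sketch for Porownaj_algorytmy above (illustration only, not
# from the original source). It assumes the function's own dependencies
# (scipy's linkage/cut_tree, the sklearn clusterers and metrics, genieclust,
# pandas) are already imported; make_blobs is an added assumption used purely
# to produce toy data.
from sklearn.datasets import make_blobs

X_toy, y_toy = make_blobs(n_samples=150, centers=3, random_state=123)
linkage_methods = ["single", "complete", "average", "weighted",
                   "centroid", "median", "ward"]
summary = Porownaj_algorytmy(X_toy, klasy=3, labels=y_toy,
                             method=linkage_methods, baza="blobs")
print(summary)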
def test_fowlkes_mallows_score():
    # General case
    score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
    assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0))
    # Perfect match but where the label names changed
    perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0])
    assert_almost_equal(perfect_score, 1.0)
    # Worst case
    worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5])
    assert_almost_equal(worst_score, 0.0)
def computeFowklesMallowsIndex(inputDirTDA, inputFileWavelet, sheetNameWavelet):
    # labels_TDA = ['m59', 'm39', 'm102', 'm6', 'm47', 'm8', 'm4', 'm98', 'm2', 'm40', 'm3']
    # clustering_TDA, labelsTDA = tdaClustering("../Results/CohomologyOPPregJNP/")
    clustering_TDA, labelsTDA = tdaClustering(inputDirTDA)
    # labels_Wavelet = ['m39', 'm40', 'm47', 'm98', 'm102', 'm2', 'm3', 'm4', 'm6', 'm8', 'm59']
    clustering_Wavelet, labelsWavelet = waveletClustering(inputFileWavelet, sheetNameWavelet)  # 167
    # [2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3] -- cutoff 175
    # [2, 2, 1, 1, 1, 1, 4, 4, 3, 3, 5] -- cutoff 160
    # [2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 4] -- cutoff 167
    # Reorganize the labels of the wavelet clustering according to the TDA
    # labels so that they are in the same order.
    reOrganizedWavelet = []
    dicWavelet = {}
    for pair in zip(labelsWavelet, clustering_Wavelet):
        dicWavelet[pair[0]] = pair[1]
    for l in labelsTDA:
        reOrganizedWavelet.append(dicWavelet[l])
    print(reOrganizedWavelet)
    score = fowlkes_mallows_score(clustering_TDA, reOrganizedWavelet)
    # score = fowlkes_mallows_score([1, 1, 0, 0], [0, 0, 1, 1])
    return score
def true_label_metrics(true_label, assigned_label, print_metric):
    """https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation"""
    true_label_metrics = {}
    true_label_metrics['adjusted_rand_score'] = \
        cluster_metric.adjusted_rand_score(true_label, assigned_label)
    # true_label_metrics['adjusted_mutual_info_score'] = \
    #     cluster_metric.adjusted_mutual_info_score(true_label, assigned_label)
    # true_label_metrics['homogeneity_completeness_v_measure'] = \
    #     cluster_metric.homogeneity_completeness_v_measure(true_label, assigned_label)
    true_label_metrics['fowlkes_mallows_score'] = \
        cluster_metric.fowlkes_mallows_score(true_label, assigned_label)
    if print_metric:
        print("Metric with true label")
        print("adjusted rand score: %s" % true_label_metrics['adjusted_rand_score'])
        # print("adjusted mutual info score: %s"
        #       % true_label_metrics['adjusted_mutual_info_score'])
        # print("homogeneity completeness v measure:")
        # print(true_label_metrics['homogeneity_completeness_v_measure'])
        print("fowlkes_mallows: %s" % true_label_metrics['fowlkes_mallows_score'])
    return true_label_metrics
def get_clustering_metrics(train_data, cluster_labels, ground_truth_labels=None):
    clustering_metric_dict = {}
    clustering_metric_dict['silhouette_score'] = silhouette_score(
        train_data, cluster_labels, random_state=42)
    clustering_metric_dict['calinski_harabasz_score'] = calinski_harabasz_score(
        train_data, cluster_labels)
    clustering_metric_dict['davies_bouldin_score'] = davies_bouldin_score(
        train_data, cluster_labels)
    if ground_truth_labels is not None:
        clustering_metric_dict['v_measure_score'] = v_measure_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['fowlkes_mallows_score'] = fowlkes_mallows_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['homogeneity_score'] = homogeneity_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['normalized_mutual_info_score'] = normalized_mutual_info_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['adjusted_rand_score'] = adjusted_rand_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['completeness_score'] = completeness_score(
            ground_truth_labels, cluster_labels)
    return clustering_metric_dict
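# A hedged usage sketch for get_clustering_metrics above (not from the original
# source). It assumes the metric functions used inside the function are imported
# from sklearn.metrics; make_blobs and KMeans here are added assumptions used
# only to generate toy data and predicted labels.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X_demo, y_demo = make_blobs(n_samples=200, centers=3, random_state=42)
pred_demo = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(X_demo)
print(get_clustering_metrics(X_demo, pred_demo, ground_truth_labels=y_demo))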
def _score_clustering(self, labels, metric='vm'):
    # Score the clustering against the true model labels.
    if metric == 'fm':
        score = fowlkes_mallows_score(self.true_labels_, labels)
    elif metric == 'ami':
        score = adjusted_mutual_info_score(self.true_labels_, labels)
    else:
        # v-measure, computed only over points actually assigned to a cluster (label > 0)
        score = v_measure_score(self.true_labels_[labels > 0], labels[labels > 0])
    return score
def _clustering_evaluation(label, labels_true, digits):
    if labels_true is None:
        FM = None
        ARI = None
    else:
        ARI = round(adjusted_rand_score(labels_true, label), digits)
        FM = round(fowlkes_mallows_score(labels_true, label), digits)
    return ARI, FM
def get_landmarking(dataset_name, df):
    start = time.time()
    record = {'dataset': dataset_name.split('.')[0]}
    results = []
    n_samples = int(len(df) * 0.1) if len(df) > 400 else min(df.shape[0], 40)
    data = df.sample(n=n_samples, replace=False)
    labels = get_dbscan(data)
    k = len(np.unique(labels))
    labels2 = get_Kmeans(data, k, 40)
    full_tree = DecisionTreeClassifier()
    full_tree.fit(data, labels)
    worst_attr = np.argmin(full_tree.feature_importances_)
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)
    best_stump = DecisionTreeClassifier(max_depth=1)
    random_stump = DecisionTreeClassifier(splitter="random", max_depth=1)
    worst_stump = DecisionTreeClassifier(max_depth=1)
    elite_knn = KNeighborsClassifier(n_neighbors=1)
    one_knn = KNeighborsClassifier(n_neighbors=1, algorithm="auto", weights="uniform",
                                   p=2, metric="minkowski")
    nb = GaussianNB()
    lda = LinearDiscriminantAnalysis()
    best_stump.fit(X_train, y_train)
    random_stump.fit(X_train, y_train)
    worst_stump.fit(X_train.iloc[:, worst_attr].values.reshape(-1, 1), y_train)
    elite_knn.fit(X_train, y_train)
    one_knn.fit(X_train, y_train)
    # lda.fit(X_train, y_train)
    nb.fit(X_train, y_train)
    record['LM1'] = np.log2(df.shape[0])
    record['LM2'] = np.log2(df.shape[1])
    record['LM3'] = accuracy_score(best_stump.predict(X_test), y_test)
    # record['LM4'] = f1_score(best_stump.predict(X_test), y_test, average='weighted')
    record['LM5'] = accuracy_score(random_stump.predict(X_test), y_test)
    # record['LM6'] = f1_score(random_stump.predict(X_test), y_test, average='weighted')
    # record['LM7'] = model.inertia_
    record['LM8'] = accuracy_score(elite_knn.predict(X_test), y_test)
    # record['LM9'] = f1_score(elite_knn.predict(X_test), y_test, average='weighted')
    # record['LM10'] = accuracy_score(lda.predict(X_test), y_test)
    # record['LM11'] = f1_score(lda.predict(X_test), y_test, average='weighted')
    record['LM12'] = accuracy_score(nb.predict(X_test), y_test)
    # record['LM13'] = f1_score(nb.predict(X_test), y_test, average='weighted')
    record['LM14'] = accuracy_score(one_knn.predict(X_test), y_test)
    # record['LM15'] = f1_score(one_knn.predict(X_test), y_test, average='weighted')
    record['LM16'] = accuracy_score(
        worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test)
    # record['LM17'] = f1_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test, average='weighted')
    record['LM18'] = adjusted_rand_score(labels, labels2)
    record['LM19'] = adjusted_mutual_info_score(labels, labels2)
    record['LM20'] = completeness_score(labels, labels2)
    record['LM21'] = fowlkes_mallows_score(labels, labels2)
    end = time.time()
    return record, (df.shape[0], df.shape[1], end - start)
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    # Test overflow in mutual_info_classif and fowlkes_mallows_score.
    # With ~75k samples the pair counts exceed the 32-bit integer range,
    # so the scores must still come out finite.
    x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) +
                 [3] * (3271 + 204) + [4] * (814 + 39) + [5] * (316 + 20))
    y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 +
                 [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 +
                 [0] * 316 + [1] * 20)
    assert_all_finite(mutual_info_score(x, y))
    assert_all_finite(fowlkes_mallows_score(x, y))
def cluster_performance(y_true, y_pred):
    '''
    Returns the FM index and the Rand index.
    :param y_true: cluster assignment of the reference model, as an ndarray
    :param y_pred: cluster assignment produced by the clustering model, as an ndarray
    :return: FM index, Rand index
    '''
    FM = fowlkes_mallows_score(y_true, y_pred)
    Rand = adjusted_rand_score(y_true, y_pred)
    return FM, Rand
def cluster_performance(y_true, y_pred):
    '''
    Returns the FM index and the Rand index.
    :param y_true: cluster assignment of the reference model, as an ndarray
    :param y_pred: cluster assignment produced by the clustering model, as an ndarray
    :return: FM index, Rand index
    '''
    #********* Begin *********#
    return fowlkes_mallows_score(y_true, y_pred), adjusted_rand_score(y_true, y_pred)
def print_stats(x, y, quiet=True):
    ari = adjusted_rand_score(x, y)
    ami = adjusted_mutual_info_score(x, y)
    fms = fowlkes_mallows_score(x, y)
    if not quiet:
        print("ARI: {}".format(ari), file=sys.stderr)
        print("AMI: {}".format(ami), file=sys.stderr)
        print("FMS: {}".format(fms), file=sys.stderr)
    return ari, ami, fms
def cluster_performance(y_true, y_pred):
    """
    Returns the FM index and the Rand index.
    :param y_true: cluster assignment of the reference model, as an ndarray
    :param y_pred: cluster assignment produced by the clustering model, as an ndarray
    :return: FM index, Rand index
    """
    # ********* Begin *********#
    rand = adjusted_rand_score(y_true, y_pred)
    fm = fowlkes_mallows_score(y_true, y_pred)
    return fm, rand
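# A small hedged example of calling cluster_performance above (illustration
# only; it assumes fowlkes_mallows_score and adjusted_rand_score are imported
# from sklearn.metrics as in the exercise). With a perfect clustering both
# indices equal 1.0; here one point is misassigned, so both drop below 1.0.
import numpy as np

y_true_demo = np.array([0, 0, 1, 1, 2, 2])
y_pred_demo = np.array([0, 0, 1, 2, 2, 2])
fm_demo, rand_demo = cluster_performance(y_true_demo, y_pred_demo)
print(fm_demo, rand_demo)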
def test_fowlkes_mallows_score_properties():
    # handcrafted example
    labels_a = np.array([0, 0, 0, 1, 1, 2])
    labels_b = np.array([1, 1, 2, 2, 0, 0])
    expected = 1. / np.sqrt((1. + 3.) * (1. + 2.))
    # FMI = TP / sqrt((TP + FP) * (TP + FN))
    score_original = fowlkes_mallows_score(labels_a, labels_b)
    assert_almost_equal(score_original, expected)
    # symmetric property
    score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
    assert_almost_equal(score_symmetric, expected)
    # permutation property
    score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
    assert_almost_equal(score_permuted, expected)
    # symmetric and permutation (both together)
    score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
    assert_almost_equal(score_both, expected)
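# A hedged cross-check of the handcrafted example in the test above (not part
# of the original test file): count co-clustered pairs directly and reproduce
# FMI = TP / sqrt((TP + FP) * (TP + FN)) = 1 / sqrt(4 * 3).
from itertools import combinations

import numpy as np

labels_a = np.array([0, 0, 0, 1, 1, 2])
labels_b = np.array([1, 1, 2, 2, 0, 0])
tp = fp = fn = 0
for i, j in combinations(range(len(labels_a)), 2):
    same_a = labels_a[i] == labels_a[j]   # pair co-clustered in labels_a
    same_b = labels_b[i] == labels_b[j]   # pair co-clustered in labels_b
    tp += same_a and same_b
    fp += same_a and not same_b
    fn += same_b and not same_a
print(tp / np.sqrt((tp + fp) * (tp + fn)))  # matches `expected` above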
def Porownaj_algorytmy2(data, klasy, labels, baza):
    """Computes the AM, AR and FM indices for the algorithm I wrote."""
    wektor = []
    # my algorithm
    wynikM = spectral_clustering(data, k=klasy, M=5)
    wektor.append([fowlkes_mallows_score(labels, wynikM),
                   adjusted_mutual_info_score(labels, wynikM),
                   adjusted_rand_score(labels, wynikM),
                   baza])
    index = ["Moj"]
    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
def evaluate(self):
    eval_result_dict = {}
    eval_result_dict['ami'] = adjusted_mutual_info_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['rand'] = adjusted_rand_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['comp'] = completeness_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['fow'] = fowlkes_mallows_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['hom'] = homogeneity_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['nmi'] = normalized_mutual_info_score(self.data['true_y'], self.data['pred_y'])
    eval_result_dict['v_score'] = v_measure_score(self.data['true_y'], self.data['pred_y'])
    return eval_result_dict
def cluster_hac(num_k):
    feature_ds, label_ds = read_dataset()
    user_max_id = num_k - 1
    sub_feature_ds = []
    sub_label_ds = []
    for i in range(0, len(label_ds)):
        if label_ds[i] <= user_max_id:
            sub_feature_ds.append(feature_ds[i])
            sub_label_ds.append(label_ds[i])
    feature_array = np.array(sub_feature_ds)
    x_scalar = StandardScaler()
    x = x_scalar.fit_transform(feature_array)
    pca = PCA(n_components=0.999)
    components = pca.fit_transform(x)
    hac = AgglomerativeClustering(n_clusters=num_k, linkage='average')
    hac.fit_predict(components)
    print(fowlkes_mallows_score(hac.labels_, sub_label_ds))
def cluster_kmeans(num_k):
    feature_ds, label_ds = read_dataset()
    user_max_id = num_k - 1
    sub_feature_ds = []
    sub_label_ds = []
    for i in range(0, len(label_ds)):
        if label_ds[i] <= user_max_id:
            sub_feature_ds.append(feature_ds[i])
            sub_label_ds.append(label_ds[i])
    feature_array = np.array(sub_feature_ds)
    x_scalar = StandardScaler()
    x = x_scalar.fit_transform(feature_array)
    pca = PCA(n_components=0.999)
    components = pca.fit_transform(x)
    kmeans = KMeans(n_clusters=num_k, random_state=0)
    kmeans.fit_predict(components)
    print(fowlkes_mallows_score(kmeans.labels_, sub_label_ds))
def compute_external_metrics(labels_true: List[str], labels_pred: List[int]) -> ExternalEvaluation:
    if len(labels_true) == 0 and len(labels_pred) == 0:
        return None
    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(labels_true, labels_pred)
    adjusted_mutual_info = adjusted_mutual_info_score(labels_true, labels_pred)
    adjusted_rand_index = adjusted_rand_score(labels_true, labels_pred)
    fowlkes_mallows = fowlkes_mallows_score(labels_true, labels_pred)
    mat = contingency_matrix(labels_true, labels_pred)
    purity = purity_score(mat)
    inverse_purity = purity_score(mat, inverse=True)
    return ExternalEvaluation(homogeneity=homogeneity,
                              completeness=completeness,
                              v_measure=v_measure,
                              adjusted_mutual_information=adjusted_mutual_info,
                              adjusted_rand_index=adjusted_rand_index,
                              fowlkes_mallows=fowlkes_mallows,
                              purity=purity,
                              inverse_purity=inverse_purity)
def externalEval(self, y_pred, true_label):
    true_label = np.array(true_label)
    n_cluster = len(set(true_label))
    y_pred_modi = y_pred.copy()
    # Group sample indices by their predicted cluster.
    result = [[] for i in range(len(set(y_pred)))]
    for i in range(len(y_pred)):
        result[y_pred[i]].append(i)
    dict1 = dict.fromkeys([i for i in range(n_cluster)], None)
    for i in list(dict1.keys()):
        dict1[i] = []
    nummostnum = 0
    # Map each predicted cluster to the majority true label among its members.
    for i in range(len(result)):
        if len(true_label[result[i]]) > 0:
            mostnum = Counter(true_label[result[i]]).most_common(1)[0][0]
            nummostnum += Counter(true_label[result[i]]).most_common(1)[0][1]
            dict1[mostnum] += (result[i])
    # Rewrite the predicted labels with the majority-vote mapping.
    for r in list(dict1.keys()):
        for i in dict1[r]:
            y_pred_modi[i] = r
    nmi = normalized_mutual_info_score(true_label, y_pred)
    purity = nummostnum / len(y_pred_modi)
    fowlkes_mallows = fowlkes_mallows_score(true_label, y_pred_modi)
    return nmi, purity, fowlkes_mallows
def kmeans(data, k):
    # Note: initialize_centroids, closest_centroid and the global `digits`
    # dataset are assumed to be defined elsewhere in this module.
    centroid = initialize_centroids(data, k)
    a = np.zeros((k, k))
    b = np.zeros(k)
    c1 = np.zeros((k, k))
    d = np.zeros(k)
    clusnew = np.zeros(len(data))
    i = 1
    while i < 100:
        clusters = closest_centroid(data, centroid, k)
        for l in range(0, k):
            centroid[l, :] = np.mean(data[(np.where(clusters == l))], axis=0)
        i = i + 1
        print(i)
    # Relabel the clusters via the confusion matrix so they line up with the true digits.
    c = confusion_matrix(clusters, digits.target)
    for j in range(0, k):
        c1[j, :] = c[:, (np.argmax(c[j, :]))]
        clusnew[clusters == (np.argmax(c[j, :]))] = j
        d[j] = sum(c1[:, j])
    c1[:, (np.argmin(d))] = -1
    print('Confusion Matrix: ', c1)
    print('Fowlkes Mallows Score: ', fowlkes_mallows_score(digits.target, clusnew))
output_file = sys.argv[1]
correct_file = sys.argv[2]
values = np.loadtxt(correct_file, dtype=int)
num_lines = sum(1 for line in open(correct_file))
result = np.zeros(num_lines)
cur_clus = -1
with open(output_file) as f:
    content = f.readlines()
for i in range(0, len(content)):
    if content[i][0] == '#':
        cur_clus += 1
        continue
    result[int(content[i])] = cur_clus
net_score = fowlkes_mallows_score(values, result)
'''
unique, counts = np.unique(result, return_counts=True)
print np.asarray((unique, counts)).T
unique, counts = np.unique(values, return_counts=True)
print np.asarray((unique, counts)).T
'''
logs = open('DBSCANLogs.txt', 'a')
logs.write(str(net_score) + '\n')
logs.close()
def _eval_clustering(self, labels_true, labels_predicted):
    # To address when COP-KMeans fails to satisfy all constraints at a k:
    if labels_predicted is None:
        # return a dictionary of None values to expose in the final output
        return {"nmi": None,
                "ami": None,
                "ari": None,
                "fms": None,
                "v_measure": None,
                "bcubed_precision": None,
                "bcubed_recall": None,
                "bcubed_fscore": None,
                "Silhouette": None,
                "Calinski_harabasz": None,
                "Davies_Bouldin": None}

    nmi = normalized_mutual_info_score(labels_true, labels_predicted,
                                       average_method="max")
    ami = adjusted_mutual_info_score(labels_true, labels_predicted,
                                     average_method="arithmetic")
    ari = adjusted_rand_score(labels_true, labels_predicted)
    v_measure = v_measure_score(labels_true, labels_predicted, beta=1.0)
    fms = fowlkes_mallows_score(labels_true, labels_predicted)

    # Reshape labels for BCubed measures
    true_dict = self._reshape_labels_as_dicts(labels_true)
    pred_dict = self._reshape_labels_as_dicts(labels_predicted)
    bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict)
    bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict)
    bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall)

    # =====================================================================
    # Unsupervised Metrics
    # =====================================================================
    if labels_predicted.nunique() not in (1, len(self.data)):
        sil = silhouette_score(X=self.data, labels=labels_predicted,
                               metric=self.distance_metric,
                               random_state=13712)
        ch = calinski_harabasz_score(X=self.data, labels=labels_predicted)
        dv = davies_bouldin_score(X=self.data, labels=labels_predicted)
    else:
        sil = None
        ch = None
        dv = None

    ret = {}
    ret.update({"nmi": round(nmi, 4),
                "ami": round(ami, 4),
                "ari": round(ari, 4),
                "fms": round(fms, 4),
                "v_measure": round(v_measure, 4),
                "bcubed_precision": round(bcubed_precision, 4),
                "bcubed_recall": round(bcubed_recall, 4),
                "bcubed_fscore": round(bcubed_f1, 4),
                "Silhouette": round(sil, 4) if sil is not None else None,
                "Calinski_harabasz": round(ch, 4) if ch is not None else None,
                "Davies_Bouldin": round(dv, 4) if dv is not None else None
                # Here goes the unsupervised indices
                })
    return ret
def _fm(labels, labels_true, digits):
    return round(fowlkes_mallows_score(labels_true, labels), digits)
data_copy = copy.copy(data)
# Drop the class column
inputs = data.drop('species', axis=1)

# Test from n_clusters = 2 until n_clusters = 6
for n_clusters in range(2, 6 + 1):
    # Fowlkes-Mallows and Silhouette evaluation:
    agglo = Agglomerative(n_clusters=n_clusters)
    agglo.fit(inputs)
    labels = np.array(agglo.predict(inputs))
    print("n_clusters =", n_clusters)
    print("Using the Fowlkes-Mallows method: ")
    fowlkes_mallows = fowlkes_mallows_score(labels, target)
    print("Fowlkes Mallows Score:", fowlkes_mallows)
    print("Using the Silhouette method:")
    silhouette_avg = silhouette_score(inputs, labels)
    print("Average silhouette score:", silhouette_avg)
    print()
    print()
    silhouette_values_per_point = silhouette_samples(inputs, labels)
    # Visualize Silhouette subplot
    # 1 row and 2 columns: Left -> silhouette plot and Right -> cluster visualization
    fig, silhouette_viz = plt.subplots(1)
    fig.set_size_inches(18, 7)
def report_clustering(distance_file, biom_file, metadata_file, num_clusters, verbose, L=2, output_file=None):
    if not isinstance(distance_file, list):
        distance_matrix = CSV.read(distance_file)
    else:
        distance_matrix = distance_file
    if output_file is not None:
        f = open(output_file, 'w')
    output_matrix = []
    AgglomerativeCluster = AgglomerativeClustering(n_clusters=num_clusters,
                                                   affinity='precomputed',
                                                   linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(n_clusters=num_clusters, metric='precomputed',
                               method='pam', init='heuristic').fit_predict(distance_matrix)
    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)
    region_names = []
    for i in range(len(PCoA_Samples)):
        if metadata[PCoA_Samples[i]]['body_site'] not in region_names:
            region_names.append(metadata[PCoA_Samples[i]]['body_site'])
        PCoA_Samples[i] = region_names.index(metadata[PCoA_Samples[i]]['body_site'])
    if verbose and L == 1:
        print('Printing results for L1-UniFrac:')
    elif verbose and L == 2:
        print('Printing results for L2-UniFrac:')
    if verbose:
        print('Metric\t\t\t\t\t\t\tAgglomerativeClustering\t\tKMedoids')
    if output_file is not None:
        if L == 1:
            f.write('Printing results for L1-UniFrac:\n')
        elif L == 2:
            f.write('Printing results for L2-UniFrac:\n')
        f.write('Metric\t\t\t\tAgglomerativeClustering\t\t\tKMedoids\n')
    if L == 1:
        output_matrix.append(['Printing results for L1-UniFrac:'])
    if L == 2:
        output_matrix.append(['Printing results for L2-UniFrac:'])
    output_matrix.append(['Metric', 'AgglomerativeClustering', 'KMedoids'])
    RI1 = rand_score(PCoA_Samples, AgglomerativeCluster)
    RI2 = rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Rand Index Score: {RI1}\t\t\t{RI2}')
    ARI1 = adjusted_rand_score(PCoA_Samples, AgglomerativeCluster)
    ARI2 = adjusted_rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Rand Index Score: {ARI1}\t\t\t{ARI2}')
    NMI1 = normalized_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    NMI2 = normalized_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Normalized Mutual Index Score: {NMI1}\t\t\t{NMI2}')
    AMI1 = adjusted_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    AMI2 = adjusted_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Mutual Info Score: {AMI1}\t\t\t{AMI2}')
    FM1 = fowlkes_mallows_score(PCoA_Samples, AgglomerativeCluster)
    FM2 = fowlkes_mallows_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Fowlkes Mallows Score: {FM1}\t\t\t{FM2}')
    if output_file is not None:
        f.write(f'Rand Index Score: {RI1}\t\t\t{RI2}\n')
        f.write(f'Adjusted Rand Index Score: {ARI1}\t\t\t{ARI2}\n')
        f.write(f'Normalized Mutual Index Score: {NMI1}\t\t\t{NMI2}\n')
        f.write(f'Adjusted Mutual Info Score: {AMI1}\t\t\t{AMI2}\n')
        f.write(f'Fowlkes Mallows Score: {FM1}\t\t\t{FM2}\n')
    output_matrix.append(['Rand Index Score:', RI1, RI2])
    output_matrix.append(['Adjusted Rand Index Score:', ARI1, ARI2])
    output_matrix.append(['Normalized Mutual Index Score:', NMI1, NMI2])
    output_matrix.append(['Adjusted Mutual Info Score:', AMI1, AMI2])
    output_matrix.append(['Fowlkes Mallows Score:', FM1, FM2])
    return output_matrix
from time import time

import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.preprocessing import scale

digits = datasets.load_digits(n_class=10)
X = scale(digits.data)
y = digits.target
n_samples, n_features = X.shape

X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)
clusnew = np.zeros(len(X))
clustering = AgglomerativeClustering(linkage='ward', n_clusters=10)
t0 = time()
clustering.fit(X_red)
print("%s : %.2fs" % ('ward', time() - t0))

# Relabel the clusters via the confusion matrix so they align with the true digits.
c1 = np.zeros((10, 10))
d = np.zeros(10)
c = confusion_matrix(clustering.labels_, y)
for j in range(0, 10):
    c1[j, :] = c[:, (np.argmax(c[j, :]))]
    clusnew[clustering.labels_ == (np.argmax(c[j, :]))] = j
    d[j] = sum(c1[:, j])
c1[:, (np.argmin(d))] = -1
print('Confusion Matrix: ', c1)
print('Fowlkes Mallows Score: ', fowlkes_mallows_score(y, clusnew))
with open(fi, 'w') as outfile:
    json.dump(label_test.tolist(), outfile)
fi = os.getcwd() + "/svm/json/label_train.json"
with open(fi, 'w') as outfile:
    json.dump(label_train.tolist(), outfile)

print("\nLinear SVC: ")
Classifier = svm.SVC(kernel='linear', probability=True)
Classifier.fit(feature_train, label_train)
joblib.dump(Classifier, 'linear_2.pkl')
print("predicting..")
predict = Classifier.predict(feature_test)
print("Expected output:", label_test)
print("Predicted output:", predict)
print("Confusion Matrix:\n", metrics.confusion_matrix(label_test, predict))
print("Fowlkes Mallows Score", fowlkes_mallows_score(label_test, predict))
try:
    print("Precision Score", precision_score(label_test, predict))
    print("Recall Score", recall_score(label_test, predict))
    print("F-measure", f1_score(label_test, predict))
    # exit()
except:
    pass

print("\nRBF SVC: ")
Classifier = svm.SVC(kernel='rbf')
Classifier.fit(feature_train, label_train)
joblib.dump(Classifier, 'rbf_2.pkl')
print("predicting..")
predict = Classifier.predict(feature_test)
def compute_scores(self, x):
    self.cluster_labels = np.ndarray((x.shape[0], ))
    for i in range(0, x.shape[0], self.batch_size):
        predictions = self.kmeans.predict(x[i:(i + self.batch_size)])
        self.cluster_labels[i:(i + self.batch_size)] = predictions
        if (i + self.batch_size) > x.shape[0]:
            predictions = self.kmeans.predict(x[i:x.shape[0]])
            self.cluster_labels[i:x.shape[0]] = predictions
    confusion_matrix = cscores.contingency_matrix(self.labels_true, self.labels_pred)
    purity_score = np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
    homogeneity_score, completeness_score, v_measure_score = cscores.homogeneity_completeness_v_measure(
        self.labels_true, self.labels_pred)
    scores = [
        # ['calinski_harabasz_score', 'internal', cscores.calinski_harabasz_score(x, self.cluster_labels)],
        ['davies_bouldin_score', 'internal',
         metrics.davies_bouldin_score(x, self.cluster_labels)],
        ['silhouette_score', 'internal',
         metrics.silhouette_score(x, self.cluster_labels)],
        # ['silhouette_samples', 'internal', cscores.silhouette_samples(x, self.cluster_labels)],
        ['purity_score', 'external', purity_score],
        ['adjusted_rand_score', 'external',
         cscores.adjusted_rand_score(self.labels_true, self.labels_pred)],
        ['completeness_score', 'external', completeness_score],
        ['fowlkes_mallows_score', 'external',
         cscores.fowlkes_mallows_score(self.labels_true, self.labels_pred)],
        ['homogeneity_score', 'external', homogeneity_score],
        ['adjusted_mutual_info_score', 'external',
         cscores.adjusted_mutual_info_score(self.labels_true, self.labels_pred)],
        ['mutual_info_score', 'external',
         cscores.mutual_info_score(self.labels_true, self.labels_pred)],
        ['normalized_mutual_info_score', 'external',
         cscores.normalized_mutual_info_score(self.labels_true, self.labels_pred)],
        ['v_measure_score', 'external', v_measure_score]
    ]
    scores = pd.DataFrame(scores, columns=['name', 'type', 'score'])
    scores.to_csv(files.small_images_classes_kmeans_scores, index=False)
n_samples, n_features = X.shape
np.random.seed(0)
k = 10
labels_y = list(set(y))

print(50 * "_")
print("KMeans clustering (implementation of algo from question 1a)")
no_of_iterations = 10
dat = {i: 0 for i in range(n_samples)}
t0 = time()
k_centers, dat = k_means(X, dat, k, no_of_iterations)
y_pred1 = [value for key, value in dat.items()]
c_m1 = confusion_matrix(y, y_pred1, labels_y)
print("PROTOCOL1: The cluster predictions for 10 clusters, i.e. k = 10 are:\n", getClusterRepresentatives(c_m1, k))
print("PROTOCOL2: Confusion Matrix: \n", c_m1)
print("PROTOCOL3: Fowlkes-Mallows score:", fowlkes_mallows_score(y, y_pred1))
print("Time taken: %.2fs" % (time() - t0))

print(50 * "_")
print("KMeans clustering (using sklearn)")
clustering1 = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
                     precompute_distances='auto', verbose=0, random_state=None,
                     copy_x=True, n_jobs=1, algorithm='auto')
t01 = time()
y_pred11 = clustering1.fit_predict(X)
c_m11 = confusion_matrix(y, y_pred11)
print("PROTOCOL1: The cluster predictions for 10 clusters, i.e. k = 10 are:\n", getClusterRepresentatives(c_m11, k))
print("PROTOCOL2: Confusion Matrix: \n", c_m11)
print("PROTOCOL3: Fowlkes-Mallows score:", fowlkes_mallows_score(y, y_pred11))
print("Time taken: %.2fs" % (time() - t01))

print(50 * "_")
print("Agglomerative Clustering with Ward linkage")