Example #1
def Porownaj_algorytmy(data, klasy, labels, method, baza):
    """
    Computes the AM, AR and FM indices for every algorithm except the one I wrote myself.
    """
    wektor = []
    test = [0] * len(method)

    # linkage algorithms
    for i, name in enumerate(method):
        Z = linkage(data, name)
        test[i] = cluster.hierarchy.cut_tree(Z, klasy)
        test[i] = [y for x in test[i] for y in x]
        wektor.append([fowlkes_mallows_score(labels, test[i]),
                       adjusted_mutual_info_score(labels, test[i]),
                       adjusted_rand_score(labels, test[i]), baza])

    # the genieclust algorithm
    wynikMG = genieclust.genie.Genie(n_clusters=klasy).fit_predict(data)
    wektor.append([fowlkes_mallows_score(labels, wynikMG),
                   adjusted_mutual_info_score(labels, wynikMG),
                   adjusted_rand_score(labels, wynikMG), baza])

    # MeanShift
    wynikCL = MeanShift(bandwidth=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikCL),
                   adjusted_mutual_info_score(labels, wynikCL),
                   adjusted_rand_score(labels, wynikCL), baza])

    # AgglomerativeClustering
    wynikFA = AgglomerativeClustering(n_clusters=klasy).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikFA),
                   adjusted_mutual_info_score(labels, wynikFA),
                   adjusted_rand_score(labels, wynikFA), baza])

    # KMeans
    wynikKM = KMeans(n_clusters=klasy, random_state=123).fit(data).labels_
    wektor.append([fowlkes_mallows_score(labels, wynikKM),
                   adjusted_mutual_info_score(labels, wynikKM),
                   adjusted_rand_score(labels, wynikKM), baza])

    # Row labels must follow the order in which results were appended above:
    # the seven linkage methods, then genieclust, MeanShift,
    # AgglomerativeClustering and KMeans.
    index = ['single', 'complete', 'average', 'weighted', 'centroid',
             'median', 'ward', 'genieclust', 'MeanShift',
             'AgglomerativeClustering', 'KMeans']

    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
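A minimal usage sketch for Porownaj_algorytmy (hypothetical; the imports below and the synthetic make_blobs data are assumptions, not part of the original snippet):

from scipy import cluster
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import AgglomerativeClustering, KMeans, MeanShift
from sklearn.datasets import make_blobs
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                             fowlkes_mallows_score)
import genieclust
import pandas as pd

# Three well-separated blobs; pass the seven SciPy linkage methods by name.
X, y = make_blobs(n_samples=150, centers=3, random_state=123)
methods = ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']
print(Porownaj_algorytmy(X, 3, y, methods, baza='blobs'))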
Example #2
def test_fowlkes_mallows_score():
    # General case
    score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2])
    assert_almost_equal(score, 4.0 / np.sqrt(12.0 * 6.0))

    # Perfect match but where the label names changed
    perfect_score = fowlkes_mallows_score([0, 0, 0, 1, 1, 1], [1, 1, 1, 0, 0, 0])
    assert_almost_equal(perfect_score, 1.0)

    # Worst case
    worst_score = fowlkes_mallows_score([0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5])
    assert_almost_equal(worst_score, 0.0)
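The expected value 4.0 / np.sqrt(12.0 * 6.0) can be re-derived from the contingency matrix; the sketch below (illustrative, not from the test suite) mirrors the sums scikit-learn computes internally:

import numpy as np
from sklearn.metrics.cluster import contingency_matrix

a = [0, 0, 0, 1, 1, 1]
b = [0, 0, 1, 1, 2, 2]
c = contingency_matrix(a, b)             # [[2, 1, 0], [0, 1, 2]]
n = len(a)
tk = np.sum(c ** 2) - n                  # 4  = twice the TP pair count
pk = np.sum(np.sum(c, axis=1) ** 2) - n  # 12 = twice (TP + FP)
qk = np.sum(np.sum(c, axis=0) ** 2) - n  # 6  = twice (TP + FN)
print(tk / np.sqrt(pk * qk))             # 4 / sqrt(12 * 6) ≈ 0.4714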
Example #4
def computeFowklesMallowsIndex(inputDirTDA, inputFileWavelet,
                               sheetNameWavelet):
    #labels_TDA = ['m59','m39','m102', 'm6', 'm47', 'm8', 'm4', 'm98', 'm2', 'm40', 'm3']
    #clustering_TDA, labelsTDA = tdaClustering("../Results/CohomologyOPPregJNP/")
    clustering_TDA, labelsTDA = tdaClustering(inputDirTDA)
    #labels_Wavelet = ['m39','m40', 'm47', 'm98', 'm102', 'm2', 'm3', 'm4', 'm6', 'm8', 'm59']
    clustering_Wavelet, labelsWavelet = waveletClustering(
        inputFileWavelet, sheetNameWavelet)
    # 167
    #[2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3] -- cutoff 175
    #[2, 2, 1, 1, 1, 1, 4, 4, 3, 3, 5] -- cutoff 160
    #[2, 2, 1, 1, 1, 1, 3, 3, 3, 3, 4] -- cutoff 167

    # Reorganize the wavelet cluster assignments so that the samples appear
    # in the same order as in the TDA clustering.
    dicWavelet = dict(zip(labelsWavelet, clustering_Wavelet))
    reOrganizedWavelet = [dicWavelet[l] for l in labelsTDA]

    print(reOrganizedWavelet)

    score = fowlkes_mallows_score(clustering_TDA, reOrganizedWavelet)

    #score = fowlkes_mallows_score([1,1,0,0], [0,0,1,1])
    return score
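The alignment step above matters because fowlkes_mallows_score compares labelings positionally: it is invariant to renaming clusters, but not to permuting one array of samples relative to the other. A tiny illustration with made-up labels:

from sklearn.metrics import fowlkes_mallows_score

a = [0, 0, 1, 1]
print(fowlkes_mallows_score(a, [1, 1, 0, 0]))  # 1.0 -- cluster names swapped, same partition
print(fowlkes_mallows_score(a, [0, 1, 0, 1]))  # 0.0 -- samples shuffled, different partition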
Example #5
    def true_label_metrics(true_label, assigned_label, print_metric):
        """ https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation"""
        true_label_metrics = {}
        true_label_metrics['adjusted_rand_score'] = \
            cluster_metric.adjusted_rand_score(true_label, assigned_label)
        # true_label_metrics['adjusted_mutual_info_score'] = \
        #     cluster_metric.adjusted_mutual_info_score(true_label,
        #                                               assigned_label)
        # true_label_metrics['homogeneity_completeness_v_measure'] = \
        #     cluster_metric.homogeneity_completeness_v_measure(true_label,
        #                                                       assigned_label)
        true_label_metrics['fowlkes_mallows_score'] = \
            cluster_metric.fowlkes_mallows_score(true_label, assigned_label)

        if print_metric:
            print("Metric with True label")
            print("adjusted rand score: %s" %
                  true_label_metrics['adjusted_rand_score'])
            # print("adjusted mutual info score: %s"
            #       % true_label_metrics['adjusted_mutual_info_score'])
            # print("homogeneity completeness v measure:")
            # print(true_label_metrics['homogeneity_completeness_v_measure'])
            print("fowlkes_mallows : %s" %
                  true_label_metrics['fowlkes_mallows_score'])

        return true_label_metrics
Example #6
def get_clustering_metrics(train_data,
                           cluster_labels,
                           ground_truth_labels=None):
    clustering_metric_dict = dict({})
    clustering_metric_dict['silhouette_score'] = silhouette_score(
        train_data, cluster_labels, random_state=42)
    clustering_metric_dict[
        'calinski_harabasz_score'] = calinski_harabasz_score(
            train_data, cluster_labels)
    clustering_metric_dict['davies_bouldin_score'] = davies_bouldin_score(
        train_data, cluster_labels)

    if ground_truth_labels is not None:
        clustering_metric_dict['v_measure_score'] = v_measure_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict[
            'fowlkes_mallows_score'] = fowlkes_mallows_score(
                ground_truth_labels, cluster_labels)
        clustering_metric_dict['homogeneity_score'] = homogeneity_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict[
            'normalized_mutual_info_score'] = normalized_mutual_info_score(
                ground_truth_labels, cluster_labels)
        clustering_metric_dict['adjusted_rand_score'] = adjusted_rand_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['completeness_score'] = completeness_score(
            ground_truth_labels, cluster_labels)

    return clustering_metric_dict
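A hypothetical invocation of get_clustering_metrics (the imports and synthetic data below are assumptions for illustration):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (adjusted_rand_score, calinski_harabasz_score,
                             completeness_score, davies_bouldin_score,
                             fowlkes_mallows_score, homogeneity_score,
                             normalized_mutual_info_score, silhouette_score,
                             v_measure_score)

X, y = make_blobs(n_samples=300, centers=4, random_state=42)
pred = KMeans(n_clusters=4, n_init=10, random_state=42).fit_predict(X)
print(get_clustering_metrics(X, pred, ground_truth_labels=y))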
Example #7
	def _score_clustering(self, labels, metric='vm'):
		# Score the clustering against the true model labels
		if metric == 'fm':
			score = fowlkes_mallows_score(self.true_labels_, labels)
		elif metric == 'ami':
			score = adjusted_mutual_info_score(self.true_labels_, labels)
		else:
			# default 'vm': only score points that received a positive label
			score = v_measure_score(self.true_labels_[labels > 0], labels[labels > 0])
		return score
Example #8
def _clustering_evaluation(label, labels_true, digits):
    if labels_true is None:
        FM = None
        ARI = None
    else:
        ARI = round(adjusted_rand_score(labels_true, label), digits)
        FM = round(fowlkes_mallows_score(labels_true, label), digits)

    return ARI, FM
Example #9
def get_landmarking(dataset_name, df):
    start = time.time()
    record = {'dataset': dataset_name.split('.')[0]}
    results = []
    n_samples = int(len(df)*0.1) if len(df) > 400 else min(df.shape[0], 40)
    data = df.sample(n=n_samples, replace=False)
    labels = get_dbscan(data)
    k = len(np.unique(labels))
    labels2 = get_Kmeans(data, k, 40)
    full_tree = DecisionTreeClassifier()
    full_tree.fit(data, labels)
    worst_attr = np.argmin(full_tree.feature_importances_)

    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)
    best_stump = DecisionTreeClassifier(max_depth=1)
    random_stump = DecisionTreeClassifier(splitter="random", max_depth=1)
    worst_stump = DecisionTreeClassifier(max_depth=1)
    elite_knn = KNeighborsClassifier(n_neighbors=1)
    one_knn = KNeighborsClassifier(n_neighbors=1,
            algorithm="auto",
            weights="uniform",
            p=2,
            metric="minkowski")
    nb = GaussianNB()
    lda = LinearDiscriminantAnalysis()
    best_stump.fit(X_train, y_train)
    random_stump.fit(X_train, y_train)
    worst_stump.fit(X_train.iloc[:, worst_attr].values.reshape(-1, 1), y_train)
    elite_knn.fit(X_train, y_train)
    one_knn.fit(X_train, y_train)
    # lda.fit(X_train, y_train)
    nb.fit(X_train, y_train)

    record['LM1'] = np.log2(df.shape[0])
    record['LM2'] = np.log2(df.shape[1])
    record['LM3'] = accuracy_score(best_stump.predict(X_test), y_test)
    # record['LM4'] = f1_score(best_stump.predict(X_test), y_test, average='weighted')
    record['LM5'] = accuracy_score(random_stump.predict(X_test), y_test)
    # record['LM6'] = f1_score(random_stump.predict(X_test), y_test, average='weighted')
    # record['LM7'] = model.inertia_
    record['LM8'] = accuracy_score(elite_knn.predict(X_test), y_test)
    # record['LM9'] = f1_score(elite_knn.predict(X_test), y_test, average='weighted')
    # record['LM10'] = accuracy_score(lda.predict(X_test), y_test)
    # record['LM11'] = f1_score(lda.predict(X_test), y_test, average='weighted')
    record['LM12'] = accuracy_score(nb.predict(X_test), y_test)
    # record['LM13'] = f1_score(nb.predict(X_test), y_test, average='weighted')
    record['LM14'] = accuracy_score(one_knn.predict(X_test), y_test)
    # record['LM15'] = f1_score(one_knn.predict(X_test), y_test, average='weighted')
    record['LM16'] = accuracy_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test)
    # record['LM17'] = f1_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test, average='weighted')
    record['LM18'] = adjusted_rand_score(labels, labels2)
    record['LM19'] = adjusted_mutual_info_score(labels, labels2)
    record['LM20'] = completeness_score(labels, labels2)
    record['LM21'] = fowlkes_mallows_score(labels, labels2)

    end = time.time()
    return record, (df.shape[0], df.shape[1], end-start)
Example #10
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    # Test overflow in mutual_info_classif and fowlkes_mallows_score
    x = np.array([1] * (52632 + 2529) + [2] * (14660 + 793) + [3] * (3271 +
                 204) + [4] * (814 + 39) + [5] * (316 + 20))
    y = np.array([0] * 52632 + [1] * 2529 + [0] * 14660 + [1] * 793 +
                 [0] * 3271 + [1] * 204 + [0] * 814 + [1] * 39 + [0] * 316 +
                 [1] * 20)

    assert_all_finite(mutual_info_score(x, y))
    assert_all_finite(fowlkes_mallows_score(x, y))
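What this regression test guards against: with roughly 75,000 samples, the squared cluster sizes used in the pair counts exceed the 32-bit integer range, so the intermediate sums must be accumulated in 64-bit. A quick check of the magnitudes involved (illustrative, not part of the test):

import numpy as np

largest = 52632 + 2529         # 55161 samples carry label 1 in x
print(largest ** 2)            # 3042735921, already > 2**31 - 1
print(np.iinfo(np.int32).max)  # 2147483647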
Example #12
def cluster_performance(y_true, y_pred):
    '''
    Return the FM index and the (adjusted) Rand index.
    :param y_true: the reference model's cluster assignment, as an ndarray
    :param y_pred: the clustering model's cluster assignment, as an ndarray
    :return: FM index, adjusted Rand index
    '''
    FM = fowlkes_mallows_score(y_true, y_pred)
    Rand = adjusted_rand_score(y_true, y_pred)
    return FM, Rand
Example #13
def cluster_performance(y_true, y_pred):
    '''
    Return the FM index and the (adjusted) Rand index.
    :param y_true: the reference model's cluster assignment, as an ndarray
    :param y_pred: the clustering model's cluster assignment, as an ndarray
    :return: FM index, adjusted Rand index
    '''

    #********* Begin *********#
    return fowlkes_mallows_score(y_true,
                                 y_pred), adjusted_rand_score(y_true, y_pred)
Example #14
def print_stats(x, y, quiet=True):
    ari = adjusted_rand_score(x, y)
    ami = adjusted_mutual_info_score(x, y)
    fms = fowlkes_mallows_score(x, y)

    if not quiet:
        print("ARI: {}".format(ari), file=sys.stderr)
        print("AMI: {}".format(ami), file=sys.stderr)
        print("FMS: {}".format(fms), file=sys.stderr)

    return ari, ami, fms
Example #15
def cluster_performance(y_true, y_pred):
    """
    Return the FM index and the (adjusted) Rand index.
    :param y_true: the reference model's cluster assignment, as an ndarray
    :param y_pred: the clustering model's cluster assignment, as an ndarray
    :return: FM index, adjusted Rand index
    """
    # ********* Begin *********#
    rand = adjusted_rand_score(y_true, y_pred)
    fm = fowlkes_mallows_score(y_true, y_pred)
    return fm, rand
Example #16
def test_fowlkes_mallows_score_properties():
    # handcrafted example
    labels_a = np.array([0, 0, 0, 1, 1, 2])
    labels_b = np.array([1, 1, 2, 2, 0, 0])
    expected = 1. / np.sqrt((1. + 3.) * (1. + 2.))
    # FMI = TP / sqrt((TP + FP) * (TP + FN))

    score_original = fowlkes_mallows_score(labels_a, labels_b)
    assert_almost_equal(score_original, expected)

    # symmetric property
    score_symmetric = fowlkes_mallows_score(labels_b, labels_a)
    assert_almost_equal(score_symmetric, expected)

    # permutation property
    score_permuted = fowlkes_mallows_score((labels_a + 1) % 3, labels_b)
    assert_almost_equal(score_permuted, expected)

    # symmetric and permutation (both together)
    score_both = fowlkes_mallows_score(labels_b, (labels_a + 2) % 3)
    assert_almost_equal(score_both, expected)
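The handcrafted expected value matches the pair-counting identity in the comment; a brute-force check over all sample pairs (illustrative, not part of the test suite):

from itertools import combinations

import numpy as np

labels_a = np.array([0, 0, 0, 1, 1, 2])
labels_b = np.array([1, 1, 2, 2, 0, 0])

tp = fp = fn = 0
for i, j in combinations(range(len(labels_a)), 2):
    same_a = labels_a[i] == labels_a[j]
    same_b = labels_b[i] == labels_b[j]
    tp += int(same_a and same_b)      # pair placed together in both partitions
    fp += int(same_a and not same_b)  # together in a, apart in b
    fn += int(same_b and not same_a)  # together in b, apart in a

print(tp, fp, fn)                           # 1 3 2
print(tp / np.sqrt((tp + fp) * (tp + fn)))  # 1 / sqrt(4 * 3)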
Example #18
def Porownaj_algorytmy2(data, klasy, labels, baza):
    """
    Computes the AM, AR and FM indices for the algorithm I wrote myself.
    """
    wektor = []

    # my algorithm
    wynikM = spectral_clustering(data, k=klasy, M=5)
    wektor.append([fowlkes_mallows_score(labels, wynikM),
                   adjusted_mutual_info_score(labels, wynikM),
                   adjusted_rand_score(labels, wynikM), baza])

    index = ["Moj"]

    return pd.DataFrame(wektor, index=index, columns=["FM", "AM", "AR", "Dane"])
Example #19
    def evaluate(self):
        eval_result_dict = {}
        eval_result_dict['ami'] = adjusted_mutual_info_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['rand'] = adjusted_rand_score(self.data['true_y'],
                                                       self.data['pred_y'])
        eval_result_dict['comp'] = completeness_score(self.data['true_y'],
                                                      self.data['pred_y'])
        eval_result_dict['fow'] = fowlkes_mallows_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['hom'] = homogeneity_score(self.data['true_y'],
                                                    self.data['pred_y'])
        eval_result_dict['nmi'] = normalized_mutual_info_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['v_score'] = v_measure_score(self.data['true_y'],
                                                      self.data['pred_y'])
        return eval_result_dict
Example #20
def cluster_hac(num_k):
    feature_ds, label_ds = read_dataset()

    user_max_id = num_k - 1
    sub_feature_ds = []
    sub_label_ds = []
    for i in range(0, len(label_ds)):
        if label_ds[i] <= user_max_id:
            sub_feature_ds.append(feature_ds[i])
            sub_label_ds.append(label_ds[i])

    feature_array = np.array(sub_feature_ds)

    x_scalar = StandardScaler()
    x = x_scalar.fit_transform(feature_array)

    pca = PCA(n_components=0.999)  # keep enough components to explain 99.9% of the variance
    components = pca.fit_transform(x)
    hac = AgglomerativeClustering(n_clusters=num_k, linkage='average')
    hac.fit_predict(components)
    print(fowlkes_mallows_score(hac.labels_, sub_label_ds))
Example #21
def cluster_kmeans(num_k):
    feature_ds, label_ds = read_dataset()

    user_max_id = num_k - 1
    sub_feature_ds = []
    sub_label_ds = []
    for i in range(0, len(label_ds)):
        if label_ds[i] <= user_max_id:
            sub_feature_ds.append(feature_ds[i])
            sub_label_ds.append(label_ds[i])

    feature_array = np.array(sub_feature_ds)

    x_scalar = StandardScaler()
    x = x_scalar.fit_transform(feature_array)

    pca = PCA(n_components=0.999)  # keep enough components to explain 99.9% of the variance
    components = pca.fit_transform(x)
    kmeans = KMeans(n_clusters=num_k, random_state=0)
    kmeans.fit_predict(components)
    print(fowlkes_mallows_score(kmeans.labels_, sub_label_ds))
Example #22
def compute_external_metrics(labels_true: List[str],
                             labels_pred: List[int]) -> ExternalEvaluation:
    if len(labels_true) == 0 and len(labels_pred) == 0:
        return None

    homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(
        labels_true, labels_pred)
    adjusted_mutual_info = adjusted_mutual_info_score(labels_true, labels_pred)
    adjusted_rand_index = adjusted_rand_score(labels_true, labels_pred)
    fowlkes_mallows = fowlkes_mallows_score(labels_true, labels_pred)

    mat = contingency_matrix(labels_true, labels_pred)
    purity = purity_score(mat)
    inverse_purity = purity_score(mat, inverse=True)

    return ExternalEvaluation(homogeneity=homogeneity,
                              completeness=completeness,
                              v_measure=v_measure,
                              adjusted_mutual_information=adjusted_mutual_info,
                              adjusted_rand_index=adjusted_rand_index,
                              fowlkes_mallows=fowlkes_mallows,
                              purity=purity,
                              inverse_purity=inverse_purity)
Example #23
File: func.py  Project: swqs1989/ds598
    def externalEval(self, y_pred, true_label):
        true_label = np.array(true_label)
        n_cluster = len(set(true_label))
        y_pred_modi = y_pred.copy()
        result = [[] for i in range(len(set(y_pred)))]
        for i in range(len(y_pred)):
            result[y_pred[i]].append(i)
        dict1 = dict.fromkeys([i for i in range(n_cluster)], None)
        for i in list(dict1.keys()):
            dict1[i] = []
        nummostnum = 0
        for i in range(len(result)):
            if len(true_label[result[i]]) > 0:
                mostnum = Counter(true_label[result[i]]).most_common(1)[0][0]
                nummostnum += Counter(
                    true_label[result[i]]).most_common(1)[0][1]
                dict1[mostnum] += (result[i])
        for r in list(dict1.keys()):
            for i in dict1[r]:
                y_pred_modi[i] = r
        nmi = normalized_mutual_info_score(true_label, y_pred)
        purity = nummostnum / len(y_pred_modi)
        fowlkes_mallows = fowlkes_mallows_score(true_label, y_pred_modi)
        return nmi, purity, fowlkes_mallows
Example #24
def kmeans(data, k):

    centroid = initialize_centroids(data, k)
    a = np.zeros((k, k))
    b = np.zeros(k)
    c1 = np.zeros((k, k))
    d = np.zeros(k)
    clusnew = np.zeros(len(data))
    i = 1
    while i < 100:
        clusters = closest_centroid(data, centroid, k)
        for l in range(0, k):
            centroid[l, :] = np.mean(data[(np.where(clusters == l))], axis=0)
        i = i + 1
        print(i)
    c = confusion_matrix(clusters, digits.target)
    for j in range(0, k):
        c1[j, :] = c[:, (np.argmax(c[j, :]))]
        clusnew[clusters == (np.argmax(c[j, :]))] = j
        d[j] = sum(c1[:, j])
    c1[:, (np.argmin(d))] = -1
    print('Confusion Matrix: ', c1)
    print('Fowlkes Mallows Score: ',
          fowlkes_mallows_score(digits.target, clusnew))
Example #25
output_file = sys.argv[1]
correct_file = sys.argv[2]

values = np.loadtxt(correct_file, dtype=int)
num_lines = sum(1 for line in open(correct_file))
result = np.zeros(num_lines)

cur_clus = -1

with open(output_file) as f:
    content = f.readlines()

for i in range(0, len(content)):
    if content[i][0] == '#':
        cur_clus += 1
        continue

    result[int(content[i])] = cur_clus

net_score = fowlkes_mallows_score(values, result)
'''
unique, counts = np.unique(result, return_counts=True)
print(np.asarray((unique, counts)).T)

unique, counts = np.unique(values, return_counts=True)
print(np.asarray((unique, counts)).T)
'''
with open('DBSCANLogs.txt', 'a') as logs:
    logs.write(str(net_score) + '\n')
Example #26
    def _eval_clustering(self, labels_true, labels_predicted):
        # To address when COP-KMeans fails to satisfy all constraints at a k:
        if labels_predicted is None:
            # return an empty dictionary to expose in the final output
            return {"nmi": None,
                    "ami": None,
                    "ari": None,
                    "fms": None,
                    "v_measure": None,
                    "bcubed_precision": None,
                    "bcubed_recall": None,
                    "bcubed_fscore": None,
                    "Silhouette": None,
                    "Calinski_harabasz": None,
                    "Davies_Bouldin": None
                    }

        nmi = normalized_mutual_info_score(labels_true,
                                           labels_predicted,
                                           average_method="max")

        ami = adjusted_mutual_info_score(labels_true,
                                         labels_predicted,
                                         average_method="arithmetic")

        ari = adjusted_rand_score(labels_true,
                                  labels_predicted)

        v_measure = v_measure_score(labels_true,
                                    labels_predicted,
                                    beta=1.0)

        fms = fowlkes_mallows_score(labels_true,
                                    labels_predicted)

        # Reshape labels for BCubed measures
        true_dict = self._reshape_labels_as_dicts(labels_true)
        pred_dict = self._reshape_labels_as_dicts(labels_predicted)

        bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict)
        bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict)
        bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall)

        # =====================================================================
        # Unsupervised Metrics
        # =====================================================================
        # Internal indices are undefined when there is a single cluster or
        # every point is its own cluster (labels_predicted is a pandas Series).
        if labels_predicted.nunique() not in (1, len(self.data)):
            sil = silhouette_score(X=self.data,
                                   labels=labels_predicted,
                                   metric=self.distance_metric,
                                   random_state=13712)

            ch = calinski_harabasz_score(X=self.data, labels=labels_predicted)

            dv = davies_bouldin_score(X=self.data, labels=labels_predicted)
        else:
            sil = None
            ch = None
            dv = None

        ret = {}
        ret.update({"nmi": round(nmi, 4),
                    "ami": round(ami, 4),
                    "ari": round(ari, 4),
                    "fms": round(fms, 4),
                    "v_measure": round(v_measure, 4),
                    "bcubed_precision": round(bcubed_precision, 4),
                    "bcubed_recall": round(bcubed_recall, 4),
                    "bcubed_fscore": round(bcubed_f1, 4),
                    "Silhouette": round(sil, 4
                                        ) if sil is not None else None,
                    "Calinski_harabasz": round(ch, 4
                                               ) if ch is not None else None,
                    "Davies_Bouldin": round(dv, 4
                                            ) if dv is not None else None
                    # Here goes the unsupervised indices
                    })

        return ret
Example #27
def _fm(labels, labels_true, digits):
    return round(fowlkes_mallows_score(labels_true, labels), digits)
Example #28
data_copy = copy.copy(data)

# Drop the class
inputs = data.drop('species', axis=1)

# Test from n_clusters = 2 until n_clusters = 6
for n_clusters in range(2, 6 + 1):
    # Fowlkes-Mallows and Silhouette evaluation:
    agglo = Agglomerative(n_clusters=n_clusters)
    agglo.fit(inputs)
    labels = np.array(agglo.predict(inputs))

    print("n_clusters =", n_clusters)

    print("Menggunakan metode Fowlkes-Mallows: ")
    fowlkes_mallows = fowlkes_mallows_score(labels, target)
    print("Fowlkes Mallows Score:", fowlkes_mallows)

    print("Menggunakan metode Silhouette:")
    silhouette_avg = silhouette_score(inputs, labels)
    print("Hasil rata-rata skor silhouette:", silhouette_avg)
    print()
    print()

    silhouette_values_per_point = silhouette_samples(inputs, labels)

    # Visualize Silhouette subplot
    # 1 row and 2 columns: Left -> silhouette plot and Right -> Cluster Visualization
    fig, silhouette_viz = plt.subplots(1)
    fig.set_size_inches(18, 7)
Example #29
def report_clustering(distance_file,
                      biom_file,
                      metadata_file,
                      num_clusters,
                      verbose,
                      L=2,
                      output_file=None):
    if not isinstance(distance_file, list):
        distance_matrix = CSV.read(distance_file)
    else:
        distance_matrix = distance_file

    if output_file is not None:
        f = open(output_file, 'w')

    output_matrix = []

    AgglomerativeCluster = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed',
        linkage='complete').fit_predict(distance_matrix)
    KMedoidsCluster = KMedoids(n_clusters=num_clusters,
                               metric='precomputed',
                               method='pam',
                               init='heuristic').fit_predict(distance_matrix)

    PCoA_Samples = BW.extract_samples(biom_file)
    metadata = meta.extract_metadata(metadata_file)
    region_names = []
    for i in range(len(PCoA_Samples)):
        if metadata[PCoA_Samples[i]]['body_site'] not in region_names:
            region_names.append(metadata[PCoA_Samples[i]]['body_site'])
        PCoA_Samples[i] = region_names.index(
            metadata[PCoA_Samples[i]]['body_site'])

    if verbose and L == 1:
        print('Printing results for L1-UniFrac:')
    elif verbose and L == 2:
        print('Printing results for L2-UniFrac:')
    if verbose:
        print('Metric\t\t\t\t\t\t\tAgglomerativeClustering\t\tKMedoids')

    if output_file is not None:
        if L == 1:
            f.write('Printing results for L1-UniFrac:\n')
        elif L == 2:
            f.write('Printing results for L2-UniFrac:\n')
        f.write('Metric\t\t\t\tAgglomerativeClustering\t\t\tKMedoids\n')

    if L == 1:
        output_matrix.append(['Printing results for L1-UniFrac:'])
    if L == 2:
        output_matrix.append(['Printing results for L2-UniFrac:'])
    output_matrix.append(['Metric', 'AgglomerativeClustering', 'KMedoids'])

    RI1 = rand_score(PCoA_Samples, AgglomerativeCluster)
    RI2 = rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Rand Index Score:               {RI1}\t\t\t{RI2}')
    ARI1 = adjusted_rand_score(PCoA_Samples, AgglomerativeCluster)
    ARI2 = adjusted_rand_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Rand Index Score:      {ARI1}\t\t\t{ARI2}')
    NMI1 = normalized_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    NMI2 = normalized_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Normalized Mutual Index Score:  {NMI1}\t\t\t{NMI2}')
    AMI1 = adjusted_mutual_info_score(PCoA_Samples, AgglomerativeCluster)
    AMI2 = adjusted_mutual_info_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Adjusted Mutual Info Score:     {AMI1}\t\t\t{AMI2}')
    FM1 = fowlkes_mallows_score(PCoA_Samples, AgglomerativeCluster)
    FM2 = fowlkes_mallows_score(PCoA_Samples, KMedoidsCluster)
    if verbose:
        print(f'Fowlkes Mallows Score:          {FM1}\t\t\t{FM2}')

    if output_file is not None:
        f.write(f'Rand Index Score:               {RI1}\t\t\t{RI2}\n')
        f.write(f'Adjusted Rand Index Score:      {ARI1}\t\t\t{ARI2}\n')
        f.write(f'Normalized Mutual Index Score:  {NMI1}\t\t\t{NMI2}\n')
        f.write(f'Adjusted Mutual Info Score:     {AMI1}\t\t\t{AMI2}\n')
        f.write(f'Fowlkes Mallows Score:          {FM1}\t\t\t{FM2}\n')

    output_matrix.append(['Rand Index Score:', RI1, RI2])
    output_matrix.append(['Adjusted Rand Index Score:', ARI1, ARI2])
    output_matrix.append(['Normalized Mutual Index Score:', NMI1, NMI2])
    output_matrix.append(['Adjusted Mutual Info Score:', AMI1, AMI2])
    output_matrix.append(['Fowlkes Mallows Score:', FM1, FM2])

    return output_matrix
Example #30
from time import time
import numpy as np
from scipy import ndimage
from matplotlib import pyplot as plt
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.preprocessing import scale
digits = datasets.load_digits(n_class=10)
X = scale(digits.data)
y = digits.target
n_samples, n_features = X.shape
X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)
clusnew = np.zeros(len(X))
clustering = AgglomerativeClustering(linkage='ward', n_clusters=10)
t0 = time()
clustering.fit(X_red)
print("%s : %.2fs" % ('ward', time() - t0))
c1 = np.zeros((10, 10))
d = np.zeros(10)
c = confusion_matrix(clustering.labels_, y)
for j in range(0, 10):
    c1[j, :] = c[:, (np.argmax(c[j, :]))]
    clusnew[clustering.labels_ == (np.argmax(c[j, :]))] = j
    d[j] = sum(c1[:, j])
c1[:, (np.argmin(d))] = -1
print('Confusion Matrix: ', c1)
print('Fowlkes Mallows Score: ', fowlkes_mallows_score(y, clusnew))
Example #31
with open(fi, 'w') as outfile:
    json.dump(label_test.tolist(), outfile)
fi = os.getcwd() + "/svm/json/label_train.json"
with open(fi, 'w') as outfile:
    json.dump(label_train.tolist(), outfile)

print "\nLinear SVC: "
Classifier = svm.SVC(kernel='linear', probability=True)
Classifier.fit(feature_train, label_train)
joblib.dump(Classifier, 'linear_2.pkl')
print "predicting.."
predict = Classifier.predict(feature_test)
print "Expected output:", label_test
print "Predicted output:", predict
print "Confusion Matrix:\n", metrics.confusion_matrix(label_test, predict)
print "Fowlkes Mallows Score", fowlkes_mallows_score(label_test, predict)

try:
    print "Precision Score", precision_score(label_test, predict)
    print "Recall Score", recall_score(label_test, predict)
    print "F-measure", f1_score(label_test, predict)
    # exit()
except:
    pass

print "\nRBF SVC: "
Classifier = svm.SVC(kernel='rbf')
Classifier.fit(feature_train, label_train)
joblib.dump(Classifier, 'rbf_2.pkl')
print "predicting.."
predict = Classifier.predict(feature_test)
Example #32
    def compute_scores(self, x):

        self.cluster_labels = np.ndarray((x.shape[0], ))

        for i in range(0, x.shape[0], self.batch_size):
            predictions = self.kmeans.predict(x[i:(i + self.batch_size)])
            self.cluster_labels[i:(i + self.batch_size)] = predictions

        if (i + self.batch_size) > x.shape[0]:
            predictions = self.kmeans.predict(x[i:x.shape[0]])
            self.cluster_labels[i:x.shape[0]] = predictions

        confusion_matrix = cscores.contingency_matrix(self.labels_true,
                                                      self.labels_pred)
        purity_score = np.sum(np.amax(confusion_matrix,
                                      axis=0)) / np.sum(confusion_matrix)
        homogeneity_score, completeness_score, v_measure_score = cscores.homogeneity_completeness_v_measure(
            self.labels_true, self.labels_pred)

        scores = [
            #['calinski_harabasz_score', 'internal', cscores.calinski_harabasz_score(x, self.cluster_labels)],
            [
                'davies_bouldin_score', 'internal',
                metrics.davies_bouldin_score(x, self.cluster_labels)
            ],
            [
                'silhouette_score', 'internal',
                metrics.silhouette_score(x, self.cluster_labels)
            ],
            #['silhouette_samples', 'internal', cscores.silhouette_samples(x, self.cluster_labels)],
            ['purity_score', 'external', purity_score],
            [
                'adjusted_rand_score', 'external',
                cscores.adjusted_rand_score(self.labels_true, self.labels_pred)
            ],
            ['completeness_score', 'external', completeness_score],
            [
                'fowlkes_mallows_score', 'external',
                cscores.fowlkes_mallows_score(self.labels_true,
                                              self.labels_pred)
            ],
            ['homogeneity_score', 'external', homogeneity_score],
            [
                'adjusted_mutual_info_score', 'external',
                cscores.adjusted_mutual_info_score(self.labels_true,
                                                   self.labels_pred)
            ],
            [
                'mutual_info_score', 'external',
                cscores.mutual_info_score(self.labels_true, self.labels_pred)
            ],
            [
                'normalized_mutual_info_score', 'external',
                cscores.normalized_mutual_info_score(self.labels_true,
                                                     self.labels_pred)
            ],
            ['v_measure_score', 'external', v_measure_score]
        ]

        scores = pd.DataFrame(scores, columns=['name', 'type', 'score'])
        scores.to_csv(files.small_images_classes_kmeans_scores, index=False)
Example #33
	n_samples, n_features = X.shape
	np.random.seed(0)
	k=10
	labels_y = list(set(y))

	print 50*"_"
	print "KMeans clustering (implementation of algo from question 1a)"
	no_of_iterations = 10
	dat = {i:0 for i in range(n_samples)} 
	t0 = time()
	k_centers, dat = k_means(X, dat, k,no_of_iterations) 
	y_pred1 = [value for key,value in dat.iteritems()]
	c_m1 = confusion_matrix(y,y_pred1, labels_y)	
	print "PROTOCOL1: The cluster predictions for 10 clusters, i.e k = 10 are:\n",getClusterRepresentatives(c_m1, k)
	print "PROTOCOL2: Confusion Matrix: \n",c_m1
	print "PROTOCOL3: Fowlkes-Mallows score:", fowlkes_mallows_score(y, y_pred1)
	print "Time taken: %.2fs" % (time() - t0)

	print 50*"_"
	print "KMeans clustering (using sklearn)"
	clustering1 = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1, algorithm='auto')
	t01 = time()
	y_pred11 = clustering1.fit_predict(X)
	c_m11 = confusion_matrix(y,y_pred11)	
	print "PROTOCOL1: The cluster predictions for 10 clusters, i.e k = 10 are:\n",getClusterRepresentatives(c_m11, k)
	print "PROTOCOL2: Confusion Matrix: \n",c_m11
	print "PROTOCOL3: Fowlkes-Mallows score:", fowlkes_mallows_score(y, y_pred11)
	print "Time taken: %.2fs" % (time() - t01)

	print 50*"_"
	print "Agglomerative Clustering with Ward linkage"