示例#1
0
def test_pair_confusion_matrix_single_cluster():
    """All samples in a single cluster: every ordered pair agrees as 'same'."""
    n_samples = 100
    labels = np.zeros((n_samples,))
    # With one cluster, all N*(N-1) ordered pairs fall in the
    # (same-in-1, same-in-2) cell of the pair confusion matrix.
    expected = np.array([[0, 0], [0, n_samples * (n_samples - 1)]])
    assert_array_equal(pair_confusion_matrix(labels, labels), expected)
示例#2
0
def test_pair_confusion_matrix_fully_dispersed():
    """Every sample is its own cluster: every ordered pair is 'different'."""
    n_samples = 100
    labels = list(range(n_samples))
    # With singleton clusters, all N*(N-1) ordered pairs fall in the
    # (different-in-1, different-in-2) cell.
    expected = np.array([[n_samples * (n_samples - 1), 0], [0, 0]])
    assert_array_equal(pair_confusion_matrix(labels, labels), expected)
示例#3
0
def print_acc(label_true, label_pred):
    """Print the pair confusion matrix, the pairwise agreement ratio
    derived from its diagonal, and the element-wise accuracy score."""
    pair_matrix = pair_confusion_matrix(label_true, label_pred)
    print(pair_matrix)
    # Fraction of ordered pairs on which both labelings agree
    # (diagonal mass over total mass of the 2x2 matrix).
    pairwise_acc = (pair_matrix[0][0] + pair_matrix[1][1]) / sum(sum(pair_matrix))
    print(pairwise_acc)

    # acc_score expects array inputs; convert before calling.
    print(acc_score(np.array(label_true), np.array(label_pred)))
示例#4
0
def get_rand_index_and_f_measure(labels_true, labels_pred, beta=1.):
    """Return (Rand index, adjusted Rand index, F-beta) for two labelings.

    All three metrics are derived from the pair confusion matrix, whose
    cells count ordered pairs: tn/fp on the first row, fn/tp on the second.
    """
    (tn, fp), (fn, tp) = pair_confusion_matrix(labels_true, labels_pred)

    # Rand index: fraction of pairs the two labelings agree on.
    ri = (tp + tn) / (tp + tn + fp + fn)
    # Adjusted Rand index in its pair-count closed form.
    ari_numerator = 2. * (tp * tn - fn * fp)
    ari_denominator = (tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)
    ari = ari_numerator / ari_denominator
    # Pairwise precision/recall feed the F-beta measure.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_beta = (1 + beta ** 2) * (precision * recall / ((beta ** 2) * precision + recall))
    return ri, ari, f_beta
示例#5
0
def test_pair_confusion_matrix():
    """Check pair_confusion_matrix against a brute-force pair count on two
    different non-trivial clusterings of N = n**2 samples."""
    n = 10
    n_samples = n ** 2
    clustering1 = np.hstack([[i + 1] * n for i in range(n)])
    clustering2 = np.hstack([[i + 1] * (n + 1) for i in range(n)])[:n_samples]
    # Reference implementation: enumerate every ordered pair (i, j), i != j,
    # and bucket it by co-membership in each clustering.
    expected = np.zeros(shape=(2, 2), dtype=np.int64)
    for i in range(n_samples):
        for j in range(n_samples):
            if i == j:
                continue
            in_same_1 = int(clustering1[i] == clustering1[j])
            in_same_2 = int(clustering2[i] == clustering2[j])
            expected[in_same_1, in_same_2] += 1
    assert_array_equal(pair_confusion_matrix(clustering1, clustering2), expected)
def gera_estatisticas(dataset, algo, estastisticas_gerais):
    """Compute and persist running Rand-index statistics for one dataset/algorithm pair.

    Reads ground-truth and predicted label CSVs, feeds them pair by pair into
    an online pair-counting accumulator (ContagemParesOnline), records the
    running Rand index plus its running mean / population std-dev, writes both
    series to CSV under resultados/, prints a comparison against
    scikit-learn's batch metrics, and appends a summary row to
    estastisticas_gerais (mutated in place).
    """
    # Input file locations: ground truth ("GT") and the algorithm's labels.
    caminho_resultados = "dados/mock/" + dataset + "/"
    path_verdade = caminho_resultados + dataset + "GT.csv"
    path_particao =  caminho_resultados + algo + "_" + dataset + "/" + algo + "_" + dataset + "_labels.csv"
    contagem_de_pares = ContagemParesOnline(dataset + '_labels_verdadeiro', algo)
    labels_verdadeiro = readCsv(path_verdade, None).to_numpy()
    labels_particao = readCsv(path_particao, None).to_numpy()
    # Ground-truth labels are shifted down by one; the predicted labels are
    # used as-is — presumably already 0-based (TODO confirm against the CSVs).
    labels_verdadeiro = labels_verdadeiro.flatten() - 1
    labels_particao = labels_particao.flatten()
    # Batch reference metrics from scikit-learn, printed for comparison below.
    sk_rand_score = rand_score(labels_verdadeiro, labels_particao)
    sk_pair_confusion_matrix = pair_confusion_matrix(labels_verdadeiro, labels_particao)
    sk_contigencia = contingency_matrix(labels_verdadeiro, labels_particao)
    rand_index = []
    df = []
    # Stream each (true, predicted) pair into the online counter, logging the
    # running Rand index and the mean/std-dev of the series so far.
    # NOTE(review): sta.mean/sta.pstdev rescan the growing list on every
    # iteration, so this loop is O(n^2) overall.
    for index in range(len(labels_verdadeiro)):
        contagem_de_pares.atualiza(labels_verdadeiro[index], labels_particao[index])
        rand_index.append(contagem_de_pares.rand_index)
        media = sta.mean(rand_index)
        desvio_padrao = sta.pstdev(rand_index)
        dados = [contagem_de_pares.rand_index, media, desvio_padrao]
        df.append(dados)
    dados_rand_index = pd.DataFrame(list(map(np.ravel, rand_index)))
    dados_estatististicos = pd.DataFrame(list(map(np.ravel, df)))
    # Persist both series under resultados/<dataset>/<algo>_<dataset>/.
    caminho = "resultados/" + dataset + "/" + algo + "_" + dataset + "/"
    os.makedirs(os.path.dirname(caminho), exist_ok=True)
    nomeArquivo =  algo + "_" + dataset + "_ri"
    dados_rand_index.to_csv(caminho + nomeArquivo + '.csv', index=False, header=False)
    nomeArquivo_dados = algo + "_" + dataset + "_estastisticas"
    dados_estatististicos.to_csv(caminho + nomeArquivo_dados + '.csv', index=False, header=['rand_index', 'media', 'desvio_padrao'])
    # Side-by-side report: scikit-learn batch values vs. the online counter.
    print("RI Python {} {}: {}".format(dataset, algo, sk_rand_score))
    print("N's Python {} {}: \n{}".format(dataset, algo, sk_pair_confusion_matrix))
    print("Contigencia's Python {} {}: \n{}".format(dataset, algo, sk_contigencia))
    print("RI {} {}: {}".format(dataset, algo, contagem_de_pares.rand_index))
    print("N's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao_pares))
    print("Contigencia's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao))
    print('Média: {}'.format(media))
    print('Desvio Padrao: {}'.format(desvio_padrao))
    # Accumulate a summary row for the caller's aggregate report.
    estastisticas_gerais.append([dataset, algo, contagem_de_pares.rand_index, media, desvio_padrao])
        dist = self.distance(self.centroids, x)
        return np.argmin(dist, axis=1) + 1


# Fit a 3-cluster model on the feature matrix and predict cluster ids.
clf = KMeansClassifier(clusters=3)
clf.fit(X.values)

predicted = clf.predict(X.values)

# Remap predicted cluster ids so they line up with the ground-truth label
# ids — presumably found by inspecting the confusion (TODO confirm mapping).
remap = {3: 3, 2: 1, 1: 2}
predicted = np.array(list(map(lambda x: remap[x], predicted)))
# NOTE(review): bare tuple expression with no effect — looks like leftover
# notebook-cell output; consider removing or assigning it.
predicted, y.values.reshape(X.values.shape[0])

draw_clusters(Xr, predicted.reshape(y.values.shape), ['red', 'pink', 'blue'])

# Pair confusion matrix between ground truth and the remapped predictions.
cm = pair_confusion_matrix(y.values.T[0], predicted)


def AdjRand(cm):
    """Adjusted Rand index from a 2x2 pair confusion matrix.

    `cm` unpacks as ((TN, FP), (FN, TP)), i.e. the layout returned by
    pair_confusion_matrix.
    """
    (tn, fp), (fn, tp) = cm

    numerator = 2. * (tp * tn - fn * fp)
    denominator = (tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)
    return numerator / denominator


# NOTE(review): return value is discarded — presumably a notebook cell that
# displayed the score; print or assign it when run as a script.
AdjRand(cm)


def inter_cluster_distances(labels, distances):