def test_adjusted_mutual_info_score():
    """Check MI, EMI and AMI (natural-log base) against reference values."""
    first = np.array([1] * 6 + [2] * 6 + [3] * 5)
    second = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])
    reference_mi = 0.41022

    # Mutual information computed directly from the two labelings
    score = mutual_info_score(first, second, log_base='e')
    assert_almost_equal(score, reference_mi, 5)

    # The same value must come out when a sparse contingency table is given
    sparse_table = contingency_matrix(first, second, sparse=True)
    score = mutual_info_score(first, second, contingency=sparse_table,
                              log_base='e')
    assert_almost_equal(score, reference_mi, 5)

    # ... and likewise for a dense contingency table
    dense_table = contingency_matrix(first, second)
    score = mutual_info_score(first, second, contingency=dense_table,
                              log_base='e')
    assert_almost_equal(score, reference_mi, 5)

    # Expected mutual information from the dense table
    total = dense_table.sum()
    expected = expected_mutual_information(dense_table, total, log_base='e')
    assert_almost_equal(expected, 0.15042, 5)

    # Adjusted mutual information
    adjusted = adjusted_mutual_info_score(first, second, log_base='e')
    assert_almost_equal(adjusted, 0.27502, 5)
    # Identical partitions up to a relabeling score exactly 1
    adjusted = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert_equal(adjusted, 1.0)

    # Repeat the labelings 110 times to exercise a much larger input
    first_big = np.array(list(first) * 110)
    second_big = np.array(list(second) * 110)
    adjusted = adjusted_mutual_info_score(first_big, second_big, log_base='e')
    # Only the first two decimal places are reliable here
    assert_almost_equal(adjusted, 0.37, 2)
예제 #2
0
def test_adjusted_mutual_info_score():
    """Validate MI, EMI and AMI on a small, fixed pair of labelings."""
    truth = np.array([1] * 6 + [2] * 6 + [3] * 5)
    guess = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information straight from the label vectors
    assert_almost_equal(mutual_info_score(truth, guess), 0.41022, 5)

    # Mutual information from a precomputed contingency table:
    # sparse representation first, dense representation second
    for as_sparse in (True, False):
        table = contingency_matrix(truth, guess, sparse=as_sparse)
        assert_almost_equal(
            mutual_info_score(truth, guess, contingency=table), 0.41022, 5)

    # Expected mutual information; `table` is the dense matrix at this point
    expected = expected_mutual_information(table, table.sum())
    assert_almost_equal(expected, 0.15042, 5)

    # Adjusted mutual information
    assert_almost_equal(adjusted_mutual_info_score(truth, guess), 0.27821, 5)
    perfect = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert perfect == pytest.approx(1.0)

    # Same labelings repeated 110 times to form a large input
    truth_big = np.array(list(truth) * 110)
    guess_big = np.array(list(guess) * 110)
    assert_almost_equal(
        adjusted_mutual_info_score(truth_big, guess_big), 0.38, 2)
예제 #3
0
def test_adjusted_mutual_info_score():
    """Compare MI, EMI and AMI with known values for a fixed example."""
    part_a = np.array([1] * 6 + [2] * 6 + [3] * 5)
    part_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # --- mutual information -------------------------------------------
    value = mutual_info_score(part_a, part_b)
    assert_almost_equal(value, 0.41022, 5)

    # precomputed contingency table, sparse form
    counts = contingency_matrix(part_a, part_b, sparse=True)
    value = mutual_info_score(part_a, part_b, contingency=counts)
    assert_almost_equal(value, 0.41022, 5)

    # precomputed contingency table, dense form
    counts = contingency_matrix(part_a, part_b)
    value = mutual_info_score(part_a, part_b, contingency=counts)
    assert_almost_equal(value, 0.41022, 5)

    # --- expected mutual information ----------------------------------
    sample_count = counts.sum()
    value = expected_mutual_information(counts, sample_count)
    assert_almost_equal(value, 0.15042, 5)

    # --- adjusted mutual information ----------------------------------
    value = adjusted_mutual_info_score(part_a, part_b)
    assert_almost_equal(value, 0.27502, 5)
    value = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert_equal(value, 1.0)

    # --- very large array (110 repetitions of the labelings) ----------
    big_a = np.array(list(part_a) * 110)
    big_b = np.array(list(part_b) * 110)
    value = adjusted_mutual_info_score(big_a, big_b)
    # Accuracy beyond two decimal places is not guaranteed
    assert_almost_equal(value, 0.37, 2)
예제 #4
0
def test_v_measure_and_mutual_information(seed=36):
    """Check the relation between v_measure, entropy and mutual information.

    For sample sizes 10, 100, 1000 and 10000 the V-measure is compared with
    ``2 * MI / (H(a) + H(b))`` (to 0 decimal places).

    Parameters
    ----------
    seed : int
        Seed for the ``RandomState`` that generates the random labelings.
    """
    # The ``np.int`` alias was removed from NumPy; the builtin ``int`` is the
    # drop-in replacement.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        # ``random_integers(0, 10, i)`` is deprecated; ``randint`` has an
        # exclusive upper bound, so 11 keeps the labels in [0, 10].
        labels_a = random_state.randint(0, 11, i)
        labels_b = random_state.randint(0, 11, i)
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
예제 #5
0
def test_v_measure_and_mutual_information(seed=36):
    """Check the relation between v_measure, entropy and mutual information.

    For sample sizes 10, 100, 1000 and 10000 the V-measure is compared with
    ``2 * MI / (H(a) + H(b))`` (to 0 decimal places).

    Parameters
    ----------
    seed : int
        Seed for the ``RandomState`` that generates the random labelings.
    """
    # The ``np.int`` alias was removed from NumPy; the builtin ``int`` is the
    # drop-in replacement.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
def test_int_overflow_mutual_info_score():
    """Test overflow in mutual_info_classif: the score must stay finite."""
    # One entry per x-label (1..5): how many samples carry y == 0 / y == 1
    group_sizes = [(52632, 2529), (14660, 793), (3271, 204), (814, 39),
                   (316, 20)]
    x_values, y_values = [], []
    for x_label, (n_zero, n_one) in enumerate(group_sizes, start=1):
        x_values += [x_label] * (n_zero + n_one)
        y_values += [0] * n_zero + [1] * n_one
    x = np.array(x_values)
    y = np.array(y_values)

    score = mutual_info_score(x.ravel(), y.ravel(), log_base='e')
    assert_all_finite(score)
예제 #7
0
def test_v_measure_and_mutual_information(seed=36):
    """Check relation between v_measure, entropy and mutual information

    For sample sizes 10, 100, 1000 and 10000 the V-measure is compared with
    ``2 * MI / (H(a) + H(b))`` (to 0 decimal places).

    Parameters
    ----------
    seed : int
        Seed for the ``RandomState`` that generates the random labelings.
    """
    # ``np.logspace`` yields floats; array sizes must be integers, so the
    # sample sizes are converted before use.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        # ``random_integers(0, 10, i)`` is deprecated; ``randint`` has an
        # exclusive upper bound, so 11 keeps the labels in [0, 10].
        labels_a = random_state.randint(0, 11, i)
        labels_b = random_state.randint(0, 11, i)
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
예제 #8
0
def test_int_overflow_mutual_info_score():
    """Test overflow in mutual_info_classif: the score must stay finite."""
    # (count with y == 0, count with y == 1) for x-labels 1 through 5
    counts_per_label = ((52632, 2529), (14660, 793), (3271, 204), (814, 39),
                        (316, 20))
    xs = []
    ys = []
    for label, (zeros, ones) in zip(range(1, 6), counts_per_label):
        xs.extend([label] * (zeros + ones))
        ys.extend([0] * zeros)
        ys.extend([1] * ones)
    x = np.array(xs)
    y = np.array(ys)

    assert_all_finite(mutual_info_score(x.ravel(), y.ravel()))
def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
    """Dispatch to the mutual-information estimator matching the variable types.

    Both discrete -> ``mutual_info_score``; exactly one discrete ->
    ``_compute_mi_cd`` with the non-discrete variable passed first;
    neither discrete -> ``_compute_mi_cc``.
    """
    if x_discrete:
        if y_discrete:
            return mutual_info_score(x, y)
        # x is discrete, y is not: the non-discrete variable goes first
        return _compute_mi_cd(y, x, n_neighbors)

    if y_discrete:
        return _compute_mi_cd(x, y, n_neighbors)

    return _compute_mi_cc(x, y, n_neighbors)
예제 #10
0
def test_int_overflow_mutual_info_fowlkes_mallows_score():
    """Test overflow in mutual_info_classif and fowlkes_mallows_score."""
    # Per x-label (1..5): number of samples with y == 0 and with y == 1
    sizes = [(52632, 2529), (14660, 793), (3271, 204), (814, 39), (316, 20)]
    x_parts = [[label] * (n0 + n1)
               for label, (n0, n1) in enumerate(sizes, start=1)]
    y_parts = [[0] * n0 + [1] * n1 for n0, n1 in sizes]
    x = np.array([value for part in x_parts for value in part])
    y = np.array([value for part in y_parts for value in part])

    # Both scores must be finite on this input
    for metric in (mutual_info_score, fowlkes_mallows_score):
        assert_all_finite(metric(x, y))
def test_v_measure_and_mutual_information(seed=36):
    """Check the relation between v_measure, entropy and mutual information.

    For sample sizes 10, 100, 1000 and 10000 the V-measure is compared with
    ``2 * MI / (H(a) + H(b))`` (to 0 decimal places), with MI and both
    entropies requested with ``log_base='e'``.

    Parameters
    ----------
    seed : int
        Seed for the ``RandomState`` that generates the random labelings.
    """
    # The ``np.int`` alias was removed from NumPy; the builtin ``int`` is the
    # drop-in replacement.
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))

        v_m = v_measure_score(labels_a, labels_b)
        mi = mutual_info_score(labels_a, labels_b, log_base='e')
        h_a = entropy(labels_a, log_base='e')
        h_b = entropy(labels_b, log_base='e')
        assert_almost_equal(v_m, 2.0 * mi / (h_a + h_b), 0)
예제 #12
0
def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
    """Compute mutual information between two variables.

    Thin dispatcher: picks the estimator to call from the discreteness
    flags of `x` and `y`.
    """
    kinds = (bool(x_discrete), bool(y_discrete))

    if kinds == (True, True):
        result = mutual_info_score(x, y)
    elif kinds == (True, False):
        # The non-discrete variable is always the first argument
        result = _compute_mi_cd(y, x, n_neighbors)
    elif kinds == (False, True):
        result = _compute_mi_cd(x, y, n_neighbors)
    else:
        result = _compute_mi_cc(x, y, n_neighbors)
    return result
def test_v_measure_and_mutual_information(seed=36):
    """Relate v_measure to entropy, mutual information and arithmetic NMI."""
    sample_sizes = np.logspace(1, 4, 4).astype(int)
    for size in sample_sizes:
        rng = np.random.RandomState(seed)
        labels_a = rng.randint(0, 10, size)
        labels_b = rng.randint(0, 10, size)

        # V-measure versus 2 * MI / (H(a) + H(b)), compared to 0 decimals
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            2.0 * mutual_info_score(labels_a, labels_b) /
            (entropy(labels_a) + entropy(labels_b)),
            0)

        # V-measure versus NMI with the arithmetic averaging method
        method = 'arithmetic'
        assert_almost_equal(
            v_measure_score(labels_a, labels_b),
            normalized_mutual_info_score(labels_a, labels_b,
                                         average_method=method))
def sklearn_measures(U, V):
    """Print and return scikit-learn clustering agreement scores for U and V.

    Labels are taken as the second index array of ``np.nonzero`` for each
    input, i.e. the column index of every non-zero entry.
    NOTE(review): presumably U and V are 2-D membership matrices with one
    non-zero entry per row -- confirm against the callers.

    Returns
    -------
    res : list
        ``[['ari', 'nmi', 'ami', 'vm'], [ari, nmi, ami, vm]]``.
    """
    #     http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    # Single-argument print(...) with pre-formatted text emits the same
    # output under Python 2 and Python 3 (the original print statements
    # were a syntax error on Python 3).
    print('{} {}'.format(U_labels, V_labels))
    #     V2_labels = np.nonzero(V2)[1]
    # NOTE(review): the value labelled 'entro(U,V)=' is the mutual
    # information score, not a joint entropy.
    print('entro(U)= {} entro(V)= {} entro(U,V)= {}'.format(
        sym.entropy(U_labels), sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))
    res = [['ari', 'nmi', 'ami', 'vm'],
           [sym.adjusted_rand_score(U_labels, V_labels),
            sym.normalized_mutual_info_score(U_labels, V_labels),
            sym.adjusted_mutual_info_score(U_labels, V_labels),
            sym.v_measure_score(U_labels, V_labels)]]
    print(res)
    return res
예제 #15
0
파일: vj_diff.py 프로젝트: kmayerb/tcrdist2
def _ami(ab_cts, average_method='arithmetic'):
    """Adjusted mutual information of two discrete categorical variables,
    computed from the joint counts given in ab_cts.

    Code adapted directly from scikit-learn's AMI to accommodate a
    counts/contingency table as input instead of rows/instances:
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_mutual_info_score.html

    Parameters
    ----------
    ab_cts : np.ndarray [len(a_classes) x len(b_classes)]
        Counts for each combination of classes in random variables a and b
        organized in a rectangular array.
    average_method : str
        Forwarded to `_generalized_average`; see the sklearn documentation
        for details.

    Returns
    -------
    ami : float
        Adjusted mutual information score for variables a and b"""
    # Marginal frequencies of a (row sums) and b (column sums)
    row_totals = np.sum(ab_cts, axis=1)
    a_freq = row_totals / np.sum(row_totals)
    col_totals = np.sum(ab_cts, axis=0)
    b_freq = col_totals / np.sum(col_totals)
    n_samples = np.sum(ab_cts)

    # MI of the two variables; the contingency table is the joint count
    # distribution [a_classes x b_classes]
    mi = mutual_info_score(None, None, contingency=ab_cts)
    # Expected value of the mutual information
    emi = expected_mutual_information(ab_cts, n_samples)

    # Entropies of the marginals and their generalized average
    h_true = _entropy(a_freq)
    h_pred = _entropy(b_freq)
    normalizer = _generalized_average(h_true, h_pred, average_method)

    # Keep the denominator at least machine-epsilon away from zero while
    # preserving its sign
    tiny = np.finfo('float64').eps
    denominator = normalizer - emi
    denominator = (min(denominator, -tiny) if denominator < 0
                   else max(denominator, tiny))
    return (mi - emi) / denominator
예제 #16
0
def test_adjusted_mutual_info_score():
    """Check MI, EMI and AMI against known values for a fixed example"""
    left = np.array([1] * 6 + [2] * 6 + [3] * 5)
    right = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information
    assert_almost_equal(mutual_info_score(left, right), 0.41022, 5)

    # Expected mutual information, from the contingency table
    table = contingency_matrix(left, right)
    total = np.sum(table)
    assert_almost_equal(expected_mutual_information(table, total),
                        0.15042, 5)

    # Adjusted mutual information
    assert_almost_equal(adjusted_mutual_info_score(left, right), 0.27502, 5)
    assert_equal(adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]), 1.0)

    # Very large array: the labelings repeated 110 times
    left_big = np.array(list(left) * 110)
    right_big = np.array(list(right) * 110)
    # Only accurate to two decimal places
    assert_almost_equal(adjusted_mutual_info_score(left_big, right_big),
                        0.37, 2)
예제 #17
0
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
    """Non-regression test for #16355: the MI score must not be negative."""
    score = mutual_info_score(labels_true, labels_pred)
    assert score >= 0
예제 #18
0
def test_mutual_info_score_positive_constant_label(labels_true, labels_pred):
    """MI must be 0 when one or both labelings are constant.

    Non-regression test for #16355.
    """
    score = mutual_info_score(labels_true, labels_pred)
    assert score == 0
예제 #19
0
# Mutual-information based feature selection (script fragment).
# NOTE(review): this fragment is Python 2 code -- it uses print statements
# and calls .remove() on the result of range().
#######
## Step 1. Initialization
#######

# Load the data matrix and the class vector from text files.
# NOTE(review): features appear to be the columns of `f` (the code iterates
# over f.T) -- confirm against the data files.
f = np.loadtxt(output_dir+'/'+filename+'_dat.txt')
cls = np.loadtxt(output_dir+'/'+filename+'_cls.txt')
s=[]          # indices of the selected features
mi_stack=[]   # MI of every feature with the classes
fi=range(len(f.T))   # indices of the features not selected yet

########
## Step 2 & 3. Compute MI w.r.t. the classes and find the 1st feature:
########
for i in range(len(f.T)):
    mi = mutual_info_score(f.T[i],cls)
    mi_stack.append(mi)
# NOTE(review): the message says "normalized", but the value computed above
# comes from mutual_info_score.
print 'Evaluating the normalized mutual information coefficient w.r.t. the classes'

# NOTE(review): this rebinds the builtin name `max` at module level.
max=np.max(mi_stack)

# Move every feature whose MI equals the maximum from `fi` into `s`.
for i in range(len(f.T)):
    if mi_stack[i] == max:
        s.append(i)
        fi.remove(i)
print '(max_mi,max_mi_index):', (max, s[0])
print 'The rest feature index:', fi

###########
## Step 4. Greedy selection: repeat until |S|=k.
## a) Calculate MI between features: I(f_i;f_s) for all pairs (f_i,f_s).
예제 #20
0
def CalPred(dataset, K, r, Probabilities, Predictions, valid_data):
    """Score `valid_data` with K tree-structured models and combine them.

    For each of the K rounds: a bootstrap-like bag of rows is drawn, a
    pairwise mutual-information matrix over the bag's columns is built, a
    random block of its entries is zeroed, a maximum spanning tree is
    extracted, and `valid_data` is scored with log2 probabilities along the
    tree. The mean score of round k is weighted by ``Probabilities[k]`` and
    stored in ``Predictions[k]``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training data. NOTE(review): the code compares entries with 0/1 and
        indexes `M_info` by column label, so this presumably expects binary
        values and integer column labels 0..n-1 -- confirm.
    K : int
        Number of rounds.
    r : int
        Size of each of the two random column groups whose mutual
        information entries are zeroed.
    Probabilities : indexable
        Weight applied to round k's mean score.
    Predictions : indexable with ``.sum()``
        Written in place at index k for every round.
    valid_data : pandas.DataFrame
        Rows to be scored.

    Returns
    -------
    The value of ``Predictions.sum()`` after all K rounds.
    """
    #    Bags_K = np.zeros((len(dataset), dataset.shape[1]))
    for k in range(K):
        # Draw 63.2% of the row positions without replacement
        samples = random.sample(range(0, len(dataset)),
                                int(0.632 * len(dataset)))
        # NOTE(review): this zero array is overwritten on the next line
        Bags_K = np.zeros((len(samples), dataset.shape[1]))
        Bags_K = dataset.iloc[samples, :]

        # Smoothed per-column probability of the value 1 (count + 2 over
        # number of rows + 4), computed on the full dataset, not the bag
        prob_x_1 = (dataset[dataset == 1].count(axis=0) + 2) / (len(dataset) +
                                                                4)
        prob_x_0 = 1 - prob_x_1
        # len(acc) - acc.groupby(0)[1].sum()

        # Pairwise mutual information between the columns of the bag
        M_info = np.zeros((len(Bags_K.columns), len(Bags_K.columns)))
        # Two disjoint random groups of r column indices each
        random1 = random.sample(range(0, len(Bags_K.columns)), r * 2)
        temp1 = random1[r:]
        temp2 = random1[:r]

        from sklearn.metrics.cluster import mutual_info_score
        for i in Bags_K.columns:
            #        print(i)
            for j in Bags_K.columns:

                M_info[i][j] = mutual_info_score(Bags_K[i].values,
                                                 Bags_K[j].values)

        # Zero the mutual information between the two random groups
        for i in temp1:
            for j in temp2:
                M_info[i][j] = 0

        from scipy.sparse import csr_matrix, find
        from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_tree

        # Maximum spanning tree: negate the weights, take the minimum
        # spanning tree, and negate the result back
        X = csr_matrix(M_info)
        Tcsr = -minimum_spanning_tree(-X)
        #    print(Tcsr)
        #    Array1 = Tcsr.toarray().astype(float)
        maxTree = Tcsr.toarray()
        #
        #
        #    Y = csr_matrix(Array1)
        #    Tcsr_depth = depth_first_tree(Y, 1, directed = False)
        #    Array2 = Tcsr_depth.toarray().astype(float)

        #    really = np.column_stack(((find(Array2))[0], (find(Array2))[1]))

        # t2G and dfs are helpers defined outside this function; the
        # traversal starts from a randomly chosen node.
        # NOTE(review): `parents` is used below as a 2-D array whose entry
        # [0, i] is the parent of node i, with -1 for a node without parent.
        G = t2G(maxTree)
        parents = dfs(G, random.randint(0, len(G) - 1))

        #    row = Bags_K.iloc[:,[really[0][0], really[0][1]]].header(None)

        def check(X, i, j):
            # Return 1 when the two-element row X equals (i, j), else 0
            count = 0
            if (X[0] == i and X[1] == j):
                count += 1
            return count

        # Accumulated log2 score of every validation row
        prediction = np.zeros(len(valid_data))
        for i in range(parents.shape[1]):
            #            print(i)
            parent = parents[0, i]

            # Two-column view (parent, child), taken from the full dataset
            table = dataset.iloc[:, [parent, i]]

            # CPD[a][b]: smoothed fraction of rows with parent == a and
            # child == b (count + 2 over number of rows + 4)
            CPD = np.zeros((2, 2))

            CPD[0][0] = (np.apply_along_axis(check, 1, table, 0, 0).sum() + 2)
            CPD[0][0] = CPD[0][0] / (len(dataset) + 4)
            CPD[0][1] = (np.apply_along_axis(check, 1, table, 0, 1).sum() + 2)
            CPD[0][1] = CPD[0][1] / (len(dataset) + 4)
            CPD[1][0] = (np.apply_along_axis(check, 1, table, 1, 0).sum() + 2)
            CPD[1][0] = CPD[1][0] / (len(dataset) + 4)
            CPD[1][1] = (np.apply_along_axis(check, 1, table, 1, 1).sum() + 2)
            CPD[1][1] = CPD[1][1] / (len(dataset) + 4)

            for j in range(len(valid_data)):
                if parent == -1:
                    # No parent: use the marginal probability of the value
                    if valid_data.iloc[j, i] == 1:
                        prediction[j] += np.log2(prob_x_1[i])
                    else:
                        prediction[j] += np.log2(prob_x_0[i])
                elif parent > -1:
                    # With a parent: joint fraction divided by the parent's
                    # marginal probability
                    if (valid_data.iloc[j, parent] == 0
                            and valid_data.iloc[j, i] == 0):
                        prediction[j] += np.log2(CPD[0][0] /
                                                 (prob_x_0[parent]))
                    elif (valid_data.iloc[j, parent] == 0
                          and valid_data.iloc[j, i] == 1):
                        prediction[j] += np.log2(CPD[0][1] /
                                                 (prob_x_0[parent]))
                    elif (valid_data.iloc[j, parent] == 1
                          and valid_data.iloc[j, i] == 0):
                        prediction[j] += np.log2(CPD[1][0] /
                                                 (prob_x_1[parent]))
                    elif (valid_data.iloc[j, parent] == 1
                          and valid_data.iloc[j, i] == 1):
                        prediction[j] += np.log2(CPD[1][1] /
                                                 (prob_x_1[parent]))

        # Weighted mean log2 score of this round
        Predictions[k] = Probabilities[k] * (prediction.sum() /
                                             len(valid_data))
    return Predictions.sum()
예제 #21
0
def info_var(z, zh):
    """Variation of information of two labelings, after M. Meila (2007).

    Computed as entropy(z) + entropy(zh) - 2 * mutual information.
    """
    h_z = entropy(z)
    h_zh = entropy(zh)
    shared = mutual_info_score(z, zh)
    return h_z + h_zh - 2 * shared
예제 #22
0
def info_var(z, zh):
    """Return the variation of information between z and zh (Meila, 2007)."""
    entropy_z, entropy_zh = entropy(z), entropy(zh)
    mutual_information = mutual_info_score(z, zh)
    return entropy_z + entropy_zh - 2*mutual_information
예제 #23
0
# Side-by-side printout of the `ipy` implementations and their scikit-learn
# counterparts (script fragment).
# NOTE(review): Python 2 code -- it uses print statements.
print ipy.entropy(labels_pred)

## reference values from scikit-learn: entropy
from sklearn.metrics.cluster import entropy
print entropy(labels_true)
print entropy(labels_pred)

print "## test mutual information"

# ipy mutual information: each labeling with itself, then true vs. predicted
print ipy.mutual_information(labels_true, labels_true)
print ipy.mutual_information(labels_pred, labels_pred)
print ipy.mutual_information(labels_true, labels_pred)

## reference values from scikit-learn: mutual information, same three pairs
from sklearn.metrics.cluster import mutual_info_score
print mutual_info_score(labels_true, labels_true)
print mutual_info_score(labels_pred, labels_pred)
print mutual_info_score(labels_true, labels_pred)

print "## test variation of information"
print ipy.information_variation(labels_true, labels_pred)

print "## test normalized mutual information"
print ipy.normalized_mutual_information([0, 0, 0, 0], [0, 1, 2, 3])
print ipy.normalized_mutual_information([0, 0, 1, 1], [1, 1, 0, 0])
print ipy.normalized_mutual_information([0, 0, 1, 1], [0, 0, 1, 1])

## reference values from scikit-learn: normalized mutual information
from sklearn.metrics.cluster import normalized_mutual_info_score
print normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
print normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
예제 #24
0
# DBSCAN: adjusted mutual information against the digit targets
print("DBSCAN evaluation: ",adjusted_mutual_info_score(digits.target, labels_dbscan))

# AgglomerativeClustering: adjusted mutual information against the digit targets
print("AgglomerativeClustering evaluation: ",adjusted_mutual_info_score(digits.target, labels_Agg))


# <a id='2.7.2'></a>
# #### 2.7.2 Evaluation using mutual_info_score

# In[139]:


# KMeans
print("KMeans evaluation: ",mutual_info_score(digits.target, labels))

# Spectral cluster
print("Spectral evaluation: ",mutual_info_score(digits.target, labels_spectral))

# DBSCAN
print("DBSCAN evaluation: ",mutual_info_score(digits.target, labels_dbscan))

# AgglomerativeClustering
print("AgglomerativeClustering evaluation: ",mutual_info_score(digits.target, labels_Agg))


# <a id='2.7.3'></a>
# #### 2.7.3 Evaluation using homogeneity_completeness_v_measure
# - The returned values lie in the range 0 to 1
# - The closer to 1, the better the true labels and the cluster labels agree.
예제 #25
0
def MI_score(clusters1, clusters2):
    """Return the mutual information score of two cluster labelings."""
    score = mutual_info_score(clusters1, clusters2)
    return score
예제 #26
0
def NMI(X, Y):
    """Return ``mutual_info_score(X, Y)``.

    NOTE(review): despite the name, the value returned here is the plain
    mutual information score; no normalization is applied in this function.
    """
    score = mutual_info_score(X, Y)
    return score
예제 #27
0
def cluster_eval(config, net, test_dataloader, tf3, crop_transform,
                 preprocessing_pool, sobel):
    """Evaluate the clustering produced by `net` on `test_dataloader`.

    The network is switched to eval mode for the duration of the call and
    back to train mode before returning. `tf3`, `crop_transform`,
    `preprocessing_pool` and `sobel` are only forwarded to
    `_clustering_get_data`.

    Accuracy is computed only when ``config.gt_k == config.output_k_B``;
    otherwise it is returned as None. The accuracy branch moves tensors to
    the GPU with ``.cuda()``.

    Returns
    -------
    tuple
        ``(nmi, mutual_information, conditional_entropy,
        ground_truth_clusters_entropy, predicted_clusters_entropy,
        accuracy, confusion_matrix)``
    """
    net.eval()

    # Computes predicted clusters and gets ground truth
    predicted_clusters, ground_truth_clusters = _clustering_get_data(
        config,
        net,
        test_dataloader,
        tf3,
        crop_transform,
        preprocessing_pool,
        sobel=sobel,
        using_IR=False,
        verbose=False)
    # Only the first entry of the returned predictions is evaluated
    # NOTE(review): presumably one entry per output head -- confirm.
    predicted_clusters = predicted_clusters[0]
    num_samples = predicted_clusters.shape[0]

    # Computes accuracy if the number of predicted clusters matches the number of ground truth ones
    accuracy = None
    if config.gt_k == config.output_k_B:
        # Pairs of (predicted cluster, ground-truth cluster)
        match = _hungarian_match(predicted_clusters, ground_truth_clusters,
                                 config.gt_k, config.output_k_B)

        # found[p] is set to 1 once predicted cluster p has been remapped
        found = torch.zeros(config.gt_k)
        reordered_preds = torch.zeros(num_samples,
                                      dtype=predicted_clusters.dtype).cuda()

        for pred_i, target_i in match:
            # reordered_preds[flat_predss_all[i] == pred_i] = target_i
            # Relabel every sample predicted as pred_i with target_i
            reordered_preds[torch.eq(
                predicted_clusters, int(pred_i))] = torch.from_numpy(
                    np.array(target_i)).cuda().int().item()
            found[pred_i] = 1
        assert (found.sum() == config.gt_k)  # each output_k must get mapped

        # Fraction of samples whose remapped prediction equals the ground truth
        accuracy = int((reordered_preds
                        == ground_truth_clusters).sum()) / float(num_samples)

    # The remaining metrics work on NumPy arrays
    predicted_clusters = predicted_clusters.cpu().numpy()
    ground_truth_clusters = ground_truth_clusters.cpu().numpy()

    confusion_matrix = compute_cluster_confusion_matrix(
        predicted_clusters, ground_truth_clusters, config.output_k_B,
        config.gt_k)

    # Computes entropies from the per-label occurrence counts
    _, predicted_clusters_distribution = np.unique(predicted_clusters,
                                                   return_counts=True)
    predicted_clusters_entropy = scipy.stats.entropy(
        predicted_clusters_distribution)

    _, ground_truth_clusters_distribution = np.unique(ground_truth_clusters,
                                                      return_counts=True)
    ground_truth_clusters_entropy = scipy.stats.entropy(
        ground_truth_clusters_distribution)

    # Computes information scores
    mutual_information = mutual_info_score(predicted_clusters,
                                           ground_truth_clusters)
    # Ground-truth entropy minus the mutual information
    conditional_entropy = -(mutual_information - ground_truth_clusters_entropy)
    nmi = normalized_mutual_info_score(predicted_clusters,
                                       ground_truth_clusters)

    net.train()

    return nmi, mutual_information, conditional_entropy, ground_truth_clusters_entropy, predicted_clusters_entropy, accuracy, confusion_matrix
예제 #28
0
    def compute_scores(self, x):
        """Predict cluster labels for `x` in batches and write scores to CSV.

        Labels predicted by ``self.kmeans`` are stored in
        ``self.cluster_labels``. Internal scores (Davies-Bouldin, silhouette)
        are computed from `x` and those labels; external scores are computed
        from ``self.labels_true`` and ``self.labels_pred``. The resulting
        table (columns ``name``, ``type``, ``score``) is written to
        ``files.small_images_classes_kmeans_scores``. Nothing is returned.

        NOTE(review): the external scores read ``self.labels_pred``, not the
        ``self.cluster_labels`` computed here -- confirm this is intended.
        """

        self.cluster_labels = np.ndarray((x.shape[0], ))

        # Predict batch by batch; slicing clamps the final, possibly shorter
        # batch to the end of x
        for i in range(0, x.shape[0], self.batch_size):
            predictions = self.kmeans.predict(x[i:(i + self.batch_size)])
            self.cluster_labels[i:(i + self.batch_size)] = predictions

        # NOTE(review): this re-predicts the tail that the last loop
        # iteration already covered; `i` is unbound when x has no rows.
        if (i + self.batch_size) > x.shape[0]:
            predictions = self.kmeans.predict(x[i:x.shape[0]])
            self.cluster_labels[i:x.shape[0]] = predictions

        confusion_matrix = cscores.contingency_matrix(self.labels_true,
                                                      self.labels_pred)
        # Purity: sum of the per-column maxima over the total count
        purity_score = np.sum(np.amax(confusion_matrix,
                                      axis=0)) / np.sum(confusion_matrix)
        homogeneity_score, completeness_score, v_measure_score = cscores.homogeneity_completeness_v_measure(
            self.labels_true, self.labels_pred)

        # One row per metric: [name, 'internal' or 'external', value]
        scores = [
            #['calinski_harabasz_score', 'internal', cscores.calinski_harabasz_score(x, self.cluster_labels)],
            [
                'davies_bouldin_score', 'internal',
                metrics.davies_bouldin_score(x, self.cluster_labels)
            ],
            [
                'silhouette_score', 'internal',
                metrics.silhouette_score(x, self.cluster_labels)
            ],
            #['silhouette_samples', 'internal', cscores.silhouette_samples(x, self.cluster_labels)],
            ['purity_score', 'external', purity_score],
            [
                'adjusted_rand_score', 'external',
                cscores.adjusted_rand_score(self.labels_true, self.labels_pred)
            ],
            ['completeness_score', 'external', completeness_score],
            [
                'fowlkes_mallows_score', 'external',
                cscores.fowlkes_mallows_score(self.labels_true,
                                              self.labels_pred)
            ],
            ['homogeneity_score', 'external', homogeneity_score],
            [
                'adjusted_mutual_info_score', 'external',
                cscores.adjusted_mutual_info_score(self.labels_true,
                                                   self.labels_pred)
            ],
            [
                'mutual_info_score', 'external',
                cscores.mutual_info_score(self.labels_true, self.labels_pred)
            ],
            [
                'normalized_mutual_info_score', 'external',
                cscores.normalized_mutual_info_score(self.labels_true,
                                                     self.labels_pred)
            ],
            ['v_measure_score', 'external', v_measure_score]
        ]

        scores = pd.DataFrame(scores, columns=['name', 'type', 'score'])
        scores.to_csv(files.small_images_classes_kmeans_scores, index=False)
예제 #29
0
    # Base path of the data set, taken from the first command-line argument
    file = sys.argv[1]

    # Header-less CSV files: <file>.ts.data and <file>.test.data
    dataset = pd.read_csv(file + ".ts.data", header=None)
    test_data = pd.read_csv(file + ".test.data", header=None)

    # Smoothed per-column probability of the value 1
    # (count + 2 over number of rows + 4)
    prob_x_1 = (dataset[dataset == 1].count(axis=0) + 2) / (len(dataset) + 4)
    prob_x_0 = 1 - prob_x_1

    # Pairwise mutual information between all columns of the data set
    M_info = np.zeros((len(dataset.columns), len(dataset.columns)))

    from sklearn.metrics.cluster import mutual_info_score
    for i in dataset.columns:
        print(i)
        for j in dataset.columns:

            M_info[i][j] = mutual_info_score(dataset[i].values,
                                             dataset[j].values)

    from scipy.sparse import csr_matrix, find
    from scipy.sparse.csgraph import minimum_spanning_tree, depth_first_tree

    # Maximum spanning tree: negate the weights, take the minimum spanning
    # tree, and negate the result back
    X = csr_matrix(M_info)
    Tcsr = -minimum_spanning_tree(-X)
    print(Tcsr)
    Array1 = Tcsr.toarray().astype(float)

    #Y = csr_matrix(A)
    # Depth-first tree over the undirected spanning tree, starting at node 1
    Tcsr_depth = depth_first_tree(Array1, 1, directed=False)
    print(Tcsr_depth)
    Array2 = Tcsr_depth.toarray().astype(float)

    # Two-column array of the (row, column) positions of the non-zero entries
    really = np.column_stack(((find(Array2))[0], (find(Array2))[1]))
def sklearn_measures(U, V):
    """Print and return scikit-learn clustering agreement scores for U and V.

    Labels are taken as the second index array of ``np.nonzero`` for each
    input, i.e. the column index of every non-zero entry.
    NOTE(review): presumably U and V are 2-D membership matrices with one
    non-zero entry per row -- confirm against the callers.

    Returns
    -------
    res : list
        ``[['ari', 'nmi', 'ami', 'vm'], [ari, nmi, ami, vm]]``.
    """
    #     http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    # Single-argument print(...) with pre-formatted text emits the same
    # output under Python 2 and Python 3 (the original print statements
    # were a syntax error on Python 3).
    print('{} {}'.format(U_labels, V_labels))
#     V2_labels = np.nonzero(V2)[1]
    # NOTE(review): the value labelled 'entro(U,V)=' is the mutual
    # information score, not a joint entropy.
    print('entro(U)= {} entro(V)= {} entro(U,V)= {}'.format(
        sym.entropy(U_labels), sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))
    res = [['ari', 'nmi', 'ami', 'vm'],
           [sym.adjusted_rand_score(U_labels, V_labels),
            sym.normalized_mutual_info_score(U_labels, V_labels),
            sym.adjusted_mutual_info_score(U_labels, V_labels),
            sym.v_measure_score(U_labels, V_labels)]]
    print(res)
    return res