예제 #1
0
    def test_emi_matlab(self):
        """Compare AMI/EMI values with reference MATLAB code.

        Checks the adjusted mutual information of a fixed pair of labelings
        against a hard-coded expected value, then checks that the Fortran and
        Cython expected-mutual-information routines agree with each other on
        the marginals of the same contingency table.

        http://www.mathworks.com/matlabcentral/fileexchange/33144-the-adjusted-mutual-information
        """

        # Labels are kept as strings; ``split()`` discards the indentation
        # that the backslash continuations leave inside the literals.
        ltrue = "11 11 11 11 11 11 11 10 10 10 10 13 13 13 13 13 13 13 13 13 12 \
        12 12 12 12 15 15 15 15 15 15 15 14 14 14 14 14 17 17 17 17 16 16 16 16 \
        16 16 19 19 19 19 19 19 19 18 18 18 18 18 18 18 20 20 20 20 20 20 1 1 1 \
        1 3 3 2 2 2 5 5 5 4 4 4 4 7 7 7 7 7 7 7 7 7 6 6 6 9 9 9 8 8".split()

        lpred = "1 19 19 13 2 20 20 8 12 5 17 10 10 13 15 20 20 6 9 8 9 10 15 \
        14 8 11 11 10 13 17 19 5 9 1 2 20 15 19 19 12 14 1 18 18 3 2 5 8 8 7 17 \
        17 17 16 11 11 14 17 16 6 8 13 17 1 3 7 9 9 1 5 18 13 17 13 12 20 11 4 \
        14 19 15 13 5 13 12 16 4 4 7 6 6 8 2 16 16 18 3 7 1 10".split()

        cm = ClusteringMetrics.from_labels(ltrue, lpred)
        ami = cm.adjusted_mutual_info()

        self.assertAlmostEqual(0.0352424389209073, ami, 12)

        # Materialize the totals before handing them to NumPy: for dict-like
        # totals on Python 3, ``.values()`` returns a view object rather than
        # a list, and ``np.asarray(view, dtype=np.int64)`` cannot convert it.
        # ``list(...)`` is a harmless copy on Python 2.
        rmarg = np.asarray(list(cm.row_totals.values()), dtype=np.int64)
        cmarg = np.asarray(list(cm.col_totals.values()), dtype=np.int64)

        # Both implementations receive identical marginals, so their results
        # should match to 10 decimal places.
        emi1 = emi_fortran(rmarg, cmarg)
        emi2 = emi_cython(rmarg, cmarg)

        self.assertAlmostEqual(emi1, emi2, 10)
예제 #2
0
def test_adjusted_mutual_info_score():
    # Compute the Mutual Information, Expected Mutual Information and
    # Adjusted Mutual Information of two small labelings and test them
    # against known values. Each quantity is checked with the operands in
    # both orders, since all three are expected to be symmetric.
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information
    mi_1 = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi_1, 0.41022, 5)
    mi_2 = mutual_info_score(labels_b, labels_a)
    assert_almost_equal(mi_2, 0.41022, 5)

    # Expected mutual information; the raw EMI from both implementations is
    # divided by the grand total of the contingency table before comparison.
    cm = ClusteringMetrics.from_labels(labels_a, labels_b)
    row_totals = np.fromiter(cm.iter_row_totals(), dtype=np.int64)
    col_totals = np.fromiter(cm.iter_col_totals(), dtype=np.int64)
    emi_1a = emi_cython(row_totals, col_totals) / cm.grand_total
    emi_1b = emi_fortran(row_totals, col_totals) / cm.grand_total
    assert_almost_equal(emi_1a, 0.15042, 5)
    assert_almost_equal(emi_1b, 0.15042, 5)
    emi_2a = emi_cython(col_totals, row_totals) / cm.grand_total
    emi_2b = emi_fortran(col_totals, row_totals) / cm.grand_total
    assert_almost_equal(emi_2a, 0.15042, 5)
    assert_almost_equal(emi_2b, 0.15042, 5)

    # Adjusted mutual information (1)
    ami_1 = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami_1, 0.27502, 5)
    # Swap the operands so the symmetry of the score is actually exercised
    # (previously this repeated the call above verbatim).
    ami_2 = adjusted_mutual_info_score(labels_b, labels_a)
    assert_almost_equal(ami_2, 0.27502, 5)

    # Adjusted mutual information (2): identical partitions under different
    # label names must score exactly 1.0, in either argument order.
    ami_1 = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert_equal(ami_1, 1.0)
    ami_2 = adjusted_mutual_info_score([2, 2, 3, 3], [1, 1, 2, 2])
    assert_equal(ami_2, 1.0)

    # Test AMI with a very large array: each labeling repeated 110 times.
    a110 = np.tile(labels_a, 110)
    b110 = np.tile(labels_b, 110)
    ami = adjusted_mutual_info_score(a110, b110)
    assert_almost_equal(ami, 0.37, 2)  # not accurate to more than 2 places