Пример #1
0
def test_mt_metrics():
    """Table 1 in Vilain et al. (1995)
    """

    # row 1
    p1 = ["A B C D".split()]
    p2 = ["A B".split(), "C D".split()]
    cm = ClusteringMetrics.from_partitions(p1, p2)
    assert_array_almost_equal(cm.muc_scores()[:2], [1.0, 0.6667], 4)

    # row 2
    p1 = ["A B".split(), "C D".split()]
    p2 = ["A B C D".split()]
    cm = ClusteringMetrics.from_partitions(p1, p2)
    assert_array_almost_equal(cm.muc_scores()[:2], [0.6667, 1.0], 4)

    # row 3
    p1 = ["A B C D".split()]
    p2 = ["A B C D".split()]
    cm = ClusteringMetrics.from_partitions(p1, p2)
    assert_array_almost_equal(cm.muc_scores()[:2], [1.0, 1.0], 4)

    # row 4 is exactly the same as row 1

    # row 5
    p1 = ["A B C".split()]
    p2 = ["A C".split(), "B"]
    cm = ClusteringMetrics.from_partitions(p1, p2)
    assert_array_almost_equal(cm.muc_scores()[:2], [1.0, 0.5], 4)
Пример #2
0
def test_split_join():
    """test split-join and related metrics

    Example given in
    http://stats.stackexchange.com/a/25001/37267

    For two different clustering pairs below, one can be obtained from the other
    by moving only two points, {9, 10} for the first pair, and {11, 12} for the
    second pair. The split-join distance for the two pairs is thus the same.

    Mirkin and VI distance is larger for the first pair (C1 and C2). This is not
    a fault of these measures as the clusterings in C3 and C4 do appear to
    capture more information than in the case of C1 and C2, and so their
    similarities should be greater.
    """

    C1 = [{1, 2, 3, 4, 5, 6, 7, 8}, {9, 10, 11, 12, 13, 14, 15, 16}]
    C2 = [{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {11, 12, 13, 14, 15, 16}]
    cm = ClusteringMetrics.from_partitions(C1, C2)
    assert_equal(cm.mirkin_mismatch_coeff(normalize=False), 56)
    assert_almost_equal(cm.vi_distance(normalize=False), 0.594, 3)
    assert_equal(cm.split_join_distance(normalize=False), 4)

    C3 = [{1, 2, 3, 4}, {5, 6, 7, 8, 9, 10}, {11, 12, 13, 14, 15, 16}]
    C4 = [{1, 2, 3, 4}, {5, 6, 7, 8, 9, 10, 11, 12}, {13, 14, 15, 16}]
    cm = ClusteringMetrics.from_partitions(C3, C4)
    assert_equal(cm.mirkin_mismatch_coeff(normalize=False), 40)
    assert_almost_equal(cm.vi_distance(normalize=False), 0.520, 3)
    assert_equal(cm.split_join_distance(normalize=False), 4)
Пример #3
0
def test_bc_metrics():
    """Examples 1 and 2, listing in Figure 9, Bagga & Baldwin (1998)
    """
    p1 = ["1 2 3 4 5".split(), "6 7".split(), "8 9 A B C".split()]

    p2 = ["1 2 3 4 5".split(), "6 7 8 9 A B C".split()]
    cm = ClusteringMetrics.from_partitions(p1, p2)
    assert_array_almost_equal(cm.bc_metrics()[:2], [0.76, 1.0], 2)
    assert_array_almost_equal(cm.muc_scores()[:2], [0.9, 1.0], 4)

    p2 = ["1 2 3 4 5 8 9 A B C".split(), "6 7".split()]
    cm = ClusteringMetrics.from_partitions(p1, p2)
    assert_array_almost_equal(cm.bc_metrics()[:2], [0.58, 1.0], 2)
    assert_array_almost_equal(cm.muc_scores()[:2], [0.9, 1.0], 4)
Пример #4
0
def test_perfect():
    p1 = [['A', 'B', 'C']]
    p2 = [['A', 'B', 'C']]
    cm = ClusteringMetrics.from_partitions(p1, p2)
    assert_almost_equal(cm.assignment_score(), 1.0, 4)
    assert_almost_equal(cm.vi_similarity(), 1.0, 4)
    assert_almost_equal(cm.split_join_similarity(), 1.0, 4)
    assert_almost_equal(cm.talburt_wang_index(), 1.0, 4)
    assert_array_almost_equal(cm.entropy_scores(), (1.0,) * 3, 4)
    assert_array_almost_equal(cm.bc_metrics(), (1.0,) * 3, 4)
    assert_array_almost_equal(cm.muc_scores(), (1.0,) * 3, 4)