Пример #1
0
    def test_emi_matlab(self):
        """Check AMI against the reference MATLAB value and cross-check EMI

        The reference implementation is available at
        http://www.mathworks.com/matlabcentral/fileexchange/33144-the-adjusted-mutual-information
        """

        ltrue = "11 11 11 11 11 11 11 10 10 10 10 13 13 13 13 13 13 13 13 13 12 \
        12 12 12 12 15 15 15 15 15 15 15 14 14 14 14 14 17 17 17 17 16 16 16 16 \
        16 16 19 19 19 19 19 19 19 18 18 18 18 18 18 18 20 20 20 20 20 20 1 1 1 \
        1 3 3 2 2 2 5 5 5 4 4 4 4 7 7 7 7 7 7 7 7 7 6 6 6 9 9 9 8 8".split()

        lpred = "1 19 19 13 2 20 20 8 12 5 17 10 10 13 15 20 20 6 9 8 9 10 15 \
        14 8 11 11 10 13 17 19 5 9 1 2 20 15 19 19 12 14 1 18 18 3 2 5 8 8 7 17 \
        17 17 16 11 11 14 17 16 6 8 13 17 1 3 7 9 9 1 5 18 13 17 13 12 20 11 4 \
        14 19 15 13 5 13 12 16 4 4 7 6 6 8 2 16 16 18 3 7 1 10".split()

        cm = ClusteringMetrics.from_labels(ltrue, lpred)

        # adjusted mutual information must match the hard-coded reference
        self.assertAlmostEqual(
            0.0352424389209073, cm.adjusted_mutual_info(), places=12)

        # row and column marginals of the contingency table as int64 arrays
        marginals = [np.asarray(totals.values(), dtype=np.int64)
                     for totals in (cm.row_totals, cm.col_totals)]

        # both EMI backends receive the same (row, column) marginals
        from_fortran, from_cython = [
            emi(*marginals) for emi in (emi_fortran, emi_cython)]

        self.assertAlmostEqual(from_fortran, from_cython, places=10)
Пример #2
0
def test_RxC_metrics():
    """Check that alternative implementations agree on random RxC tables
    """
    # (reference scorer, scorer under test) pairs, all called as f(ltrue, lpred):
    # mutual information, adjusted mutual information, adjusted Rand index
    label_scorers = (
        (sklearn_mi, mutual_info_score),
        (sklearn_ami, adjusted_mutual_info_score),
        (sklearn_ari, adjusted_rand_score),
    )

    for _ in xrange(100):
        ltrue = np.random.randint(low=0, high=5, size=(20,))
        lpred = np.random.randint(low=0, high=5, size=(20,))
        cm = ClusteringMetrics.from_labels(ltrue, lpred)

        # homogeneity, completeness, V-measure
        v_from_vi = cm.vi_similarity_m3()
        hcv_reference = sklearn_hcv(ltrue, lpred)
        hcv_computed = cm.entropy_scores()
        assert_array_almost_equal(hcv_computed, hcv_reference)
        assert_array_almost_equal(hcv_computed[2], v_from_vi)

        # label-based scores: the reference is evaluated first, then ours
        for reference, candidate in label_scorers:
            wanted = reference(ltrue, lpred)
            got = candidate(ltrue, lpred)
            assert_array_almost_equal(got, wanted)
Пример #3
0
def test_RxC_general():
    """General contingency-table methods
    """
    for _ in xrange(100):
        size = np.random.randint(4, 100)

        # each labeling gets its own randomly chosen exclusive upper bound
        a_bound = np.random.randint(low=2, high=100)
        a = np.random.randint(low=0, high=a_bound, size=(size,))
        b_bound = np.random.randint(low=2, high=100)
        b = np.random.randint(low=0, high=b_bound, size=(size,))
        cm = ClusteringMetrics.from_labels(a, b)

        # the slow implementation must agree both without and with padding
        for pad in (False, True):
            assert_almost_equal(
                cm.assignment_score(model=None),
                assignment_score_slow(cm, rpad=pad, cpad=pad))

        adjustable = (cm.assignment_score, cm.split_join_similarity)
        for model in ['m1', 'm2r', 'm2c', 'm3']:

            # cells of the expected table must sum to the grand total
            expected_sum = sum(cm.expected(model=model).itervalues())
            assert_almost_equal(cm.grand_total, expected_sum)

            # built-in adjustment must match the generic adjust_to_null
            for metric in adjustable:
                assert_almost_equal(
                    metric(model=model),
                    cm.adjust_to_null(metric, model=model)[0])
Пример #4
0
def add_incidence_metrics(args, clusters, pairs):
    """Append incidence-matrix (class vs. cluster) scores to ``pairs``

    Nothing is done unless at least one of ``utils.INCIDENCE_METRICS`` was
    requested through ``args.metrics``.
    """
    requested = args.metrics
    if not (set(utils.INCIDENCE_METRICS) & set(requested)):
        return

    # imported lazily, only when incidence metrics are actually requested
    from lsh_hdc.metrics import ClusteringMetrics

    labels = clusters_to_labels(
        clusters,
        double_negs=bool(args.double_negs),
        join_negs=bool(args.join_negs)
    )
    cm = ClusteringMetrics.from_labels(*labels)

    # pairwise metrics first, then contingency metrics
    for family in (utils.PAIRWISE_METRICS, utils.CONTINGENCY_METRICS):
        selected = set(family) & set(requested)
        append_scores(cm, pairs, selected)
Пример #5
0
def test_adjusted_mutual_info_score():
    # Compute the Adjusted Mutual Information and test against known values.
    # Each quantity (MI, EMI, AMI) is checked with both argument orders so
    # that symmetry is verified along with the value itself.
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information
    mi_1 = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi_1, 0.41022, 5)
    mi_2 = mutual_info_score(labels_b, labels_a)
    assert_almost_equal(mi_2, 0.41022, 5)

    # Expected mutual information: both backends take the table marginals
    # and their result is divided by the grand total before comparison
    cm = ClusteringMetrics.from_labels(labels_a, labels_b)
    row_totals = np.fromiter(cm.iter_row_totals(), dtype=np.int64)
    col_totals = np.fromiter(cm.iter_col_totals(), dtype=np.int64)
    emi_1a = emi_cython(row_totals, col_totals) / cm.grand_total
    emi_1b = emi_fortran(row_totals, col_totals) / cm.grand_total
    assert_almost_equal(emi_1a, 0.15042, 5)
    assert_almost_equal(emi_1b, 0.15042, 5)
    emi_2a = emi_cython(col_totals, row_totals) / cm.grand_total
    emi_2b = emi_fortran(col_totals, row_totals) / cm.grand_total
    assert_almost_equal(emi_2a, 0.15042, 5)
    assert_almost_equal(emi_2b, 0.15042, 5)

    # Adjusted mutual information (1)
    ami_1 = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami_1, 0.27502, 5)
    # swapped argument order (previously a verbatim repeat of the call above,
    # which left the symmetric case untested)
    ami_2 = adjusted_mutual_info_score(labels_b, labels_a)
    assert_almost_equal(ami_2, 0.27502, 5)

    # Adjusted mutual information (2): identical partitions up to relabeling
    ami_1 = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert_equal(ami_1, 1.0)
    ami_2 = adjusted_mutual_info_score([2, 2, 3, 3], [1, 1, 2, 2])
    assert_equal(ami_2, 1.0)

    # Test AMI with a very large array (each labeling repeated 110 times)
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    assert_almost_equal(ami, 0.37, 2)  # not accurate to more than 2 places
Пример #6
0
def test_IR_example():
    """Check metric values on the example from the IR book by Manning et al.

    The example gives 3 clusters and 17 points total. It is described on
    http://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html
    """
    ltrue = (0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2)
    lpred = (0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 2, 2)
    cm = ClusteringMetrics.from_labels(ltrue, lpred)

    # test perfect variants: every one of them must score 1.0
    rd = cm.row_diag()
    cd = cm.col_diag()
    perfect_cases = [
        (rd, {}),
        (cd, {}),
        (cd, {'discrete': True}),
        (rd, {}),
        (rd, {'discrete': True}),
    ]
    for table, extra in perfect_cases:
        assert_almost_equal(
            table.assignment_score(model='m3', **extra), 1.0, 6)

    # test that no redraws happen by default: two calls give the same score
    first_score = cm.assignment_score(model='m3')
    second_score = cm.assignment_score(model='m3')
    assert_almost_equal(first_score, second_score, 6)

    continuous_null = cm.expected(discrete=False)
    assert_almost_equal(continuous_null.assignment_score(model='m3'), 0.0, 6)

    # test that H1 results in greater score than H0
    discrete_null = cm.expected(discrete=True)
    observed_score = cm.assignment_score(model='m3')
    null_score = discrete_null.assignment_score(model='m3')
    assert_greater(observed_score, null_score)

    # test entropy metrics (exactly three scores are expected)
    first, second, third = cm.entropy_scores()
    entropy_pairs = zip((first, second, third),
                        (0.371468, 0.357908, 0.364562))
    for got, wanted in entropy_pairs:
        assert_almost_equal(got, wanted, 6)

    vi_by_model = [
        (None,  0.517754),
        ('m1',  0.378167),
        ('m2r', 0.365605),
        ('m2c', 0.377165),
        ('m3',  0.364562),
    ]
    for model, wanted in vi_by_model:
        assert_almost_equal(cm.vi_similarity(model=model), wanted, 6)

    # each scorer is called lazily inside the loop, in the listed order
    for scorer, wanted in [
            (cm.mirkin_match_coeff, 0.695502),
            (cm.rand_index,         0.676471),
            (cm.fowlkes_mallows,    0.476731)]:
        assert_almost_equal(scorer(), wanted, 6)

    for model, wanted in [(None, 0.705882), ('m3', 0.554974)]:
        assert_almost_equal(cm.assignment_score(model=model), wanted, 6)

    for scorer, wanted in [
            (cm.chisq_score, 11.900000),
            (cm.g_score,     13.325845)]:
        assert_almost_equal(scorer(), wanted, 6)

    # test metrics that are based on pairwise co-association matrix
    conf = cm.pairwise
    pairwise_expectations = [
        (conf.chisq_score,        8.063241),
        (conf.g_score,            7.804221),
        (conf.jaccard_coeff,      0.312500),
        (conf.ochiai_coeff,       0.476731),
        (conf.dice_coeff,         0.476190),
        (conf.sokal_sneath_coeff, 0.185185),
        (conf.kappa,              0.242915),
        (conf.accuracy,           0.676471),
        (conf.precision,          0.500000),
        (conf.recall,             0.454545),
    ]
    for scorer, wanted in pairwise_expectations:
        assert_almost_equal(scorer(), wanted, 6)

    # the method must agree with the standalone reference implementation
    tw_reference = _talburt_wang_index(ltrue, lpred)
    tw_computed = cm.talburt_wang_index()
    assert_almost_equal(tw_reference, tw_computed, 6)
Пример #7
0
# Load the (ltrue, lpred) label arrays from PATH when the file exists;
# otherwise generate random labels and save them to PATH.
if os.path.exists(PATH):
    print("Loading data from %s" % PATH)
    # NOTE(review): unpickling can execute arbitrary code -- PATH should only
    # ever point at a file written by this script.
    # Binary mode is required: the data is written with a binary protocol.
    with open(PATH, 'rb') as fh:
        ltrue, lpred = pickle.load(fh)
else:
    shape = (ARGS.num_samples,)
    ltrue = np.random.randint(low=0, high=ARGS.max_classes, size=shape)
    lpred = np.random.randint(low=0, high=ARGS.max_clusters, size=shape)
    print("Saving generated data to %s" % PATH)
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in 'wb'
    # (text mode can corrupt the stream on platforms that translate newlines)
    with open(PATH, 'wb') as fh:
        pickle.dump((ltrue, lpred), fh, protocol=pickle.HIGHEST_PROTOCOL)


# Select the callable under test and the statement string that invokes it.
# METHODS[name] is indexed with [0] for the function-style implementations
# and with [1] for the method on ClusteringMetrics.
if ARGS.implementation == 'oo':
    from lsh_hdc.metrics import ClusteringMetrics
    cm = ClusteringMetrics.from_labels(ltrue, lpred)
    method = getattr(cm, METHODS[ARGS.method][1])
    line = "method()"
elif ARGS.implementation == 'sklearn':
    import sklearn.metrics.cluster as module
    method = getattr(module, METHODS[ARGS.method][0])
    line = "method(ltrue, lpred)"
elif ARGS.implementation == 'proposed':
    import lsh_hdc.metrics as module
    method = getattr(module, METHODS[ARGS.method][0])
    line = "method(ltrue, lpred)"
else:
    # ArgumentError takes (argument, message); None means "no specific action"
    raise argparse.ArgumentError(None, 'Unknown value for --implementation')


print("Sanity check:")
Пример #8
0
 def matrix_from_labels(*args):
     """Build a ClusteringMetrics object from exactly two label sequences.

     Raises ValueError when the number of positional arguments is not two.
     """
     (true_labels, predicted_labels) = args
     matrix = ClusteringMetrics.from_labels(true_labels, predicted_labels)
     return matrix