示例#1
0
    def test_whitespace_tokenizer(self):
        """Test abydos.tokenizer.WhitespaceTokenizer."""
        self.assertEqual(sorted(WhitespaceTokenizer().tokenize('').get_list()),
                         [])
        self.assertEqual(
            sorted(WhitespaceTokenizer().tokenize('a').get_list()), ['a'])

        self.assertEqual(
            sorted(WhitespaceTokenizer().tokenize('NELSON').get_list()),
            sorted(['NELSON']),
        )
        self.assertEqual(
            sorted(WhitespaceTokenizer().tokenize('NEILSEN').get_list()),
            sorted(['NEILSEN']),
        )

        tweet = 'Good to be home for a night. Even better to see the\
        @chicagobulls start the season off right! #SeeRed'

        self.assertEqual(
            sorted(WhitespaceTokenizer().tokenize(tweet).get_list()),
            sorted([
                'Good',
                'to',
                'be',
                'home',
                'for',
                'a',
                'night.',
                'Even',
                'better',
                'to',
                'see',
                'the',
                '@chicagobulls',
                'start',
                'the',
                'season',
                'off',
                'right!',
                '#SeeRed',
            ]),
        )
示例#2
0
    def test_soft_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim (soft)."""
        # Base cases
        self.assertEqual(self.cmp_j_soft.sim('', ''), 1.0)
        self.assertEqual(self.cmp_j_soft.sim('a', ''), 0.0)
        self.assertEqual(self.cmp_j_soft.sim('', 'a'), 0.0)
        self.assertEqual(self.cmp_j_soft.sim('abc', ''), 0.0)
        self.assertEqual(self.cmp_j_soft.sim('', 'abc'), 0.0)
        self.assertEqual(self.cmp_j_soft.sim('abc', 'abc'), 1.0)
        self.assertAlmostEqual(self.cmp_j_soft.sim('abcd', 'efgh'), 0.11111111)

        self.assertAlmostEqual(self.cmp_j_soft.sim('Nigel', 'Niall'), 0.5)
        self.assertAlmostEqual(self.cmp_j_soft.sim('Niall', 'Nigel'), 0.5)
        self.assertAlmostEqual(self.cmp_j_soft.sim('Colin', 'Coiln'), 0.6)
        self.assertAlmostEqual(self.cmp_j_soft.sim('Coiln', 'Colin'), 0.6)
        self.assertAlmostEqual(self.cmp_j_soft.sim('ATCAACGAGT', 'AACGATTAG'),
                               0.68)

        self.assertAlmostEqual(
            Jaccard(intersection_type='soft',
                    tokenizer=WhitespaceTokenizer()).sim(
                        'junior system analyst', 'systems analyst'),
            0.6190476190476191,
        )
        self.assertAlmostEqual(
            Jaccard(intersection_type='soft',
                    tokenizer=WhitespaceTokenizer()).sim(
                        'systems analyst', 'junior system analyst'),
            0.6190476190476191,
        )

        with self.assertRaises(TypeError):
            Jaccard(
                intersection_type='soft',
                metric=JaroWinkler(),
                tokenizer=WhitespaceTokenizer(),
            ).sim('junior system analyst', 'systems analyst')
示例#3
0
class QGramTestCases(unittest.TestCase):
    """Test QGram functions.

    abydos.distance.QGram
    """

    cmp = QGram()
    cmp_q1 = QGram(qval=1)
    cmp_ws = QGram(tokenizer=WhitespaceTokenizer())

    def test_q_gram_dist(self):
        """Test abydos.distance.QGram.dist."""
        # Base cases
        self.assertEqual(self.cmp.dist('', ''), 0.0)
        self.assertEqual(self.cmp.dist('a', ''), 0.0)
        self.assertEqual(self.cmp.dist('', 'a'), 0.0)
        self.assertEqual(self.cmp.dist('abc', ''), 1.0)
        self.assertEqual(self.cmp.dist('', 'abc'), 1.0)
        self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0)
        self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0)

        self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.8571428571)
        self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.8571428571)
        self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.8571428571)
        self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.8571428571)
        self.assertAlmostEqual(self.cmp.dist('ATCAACGAGT', 'AACGATTAG'),
                               0.4545454545)

    def test_q_gram_sim(self):
        """Test abydos.distance.QGram.sim."""
        # Base cases
        self.assertEqual(self.cmp.sim('', ''), 1.0)
        self.assertEqual(self.cmp.sim('a', ''), 1.0)
        self.assertEqual(self.cmp.sim('', 'a'), 1.0)
        self.assertEqual(self.cmp.sim('abc', ''), 0.0)
        self.assertEqual(self.cmp.sim('', 'abc'), 0.0)
        self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0)
        self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0)

        self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.1428571429)
        self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.1428571429)
        self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.1428571429)
        self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.1428571429)
        self.assertAlmostEqual(self.cmp.sim('ATCAACGAGT', 'AACGATTAG'),
                               0.5454545455)

    def test_q_gram_dist_abs(self):
        """Test abydos.distance.QGram.dist_abs."""
        # Base cases
        self.assertEqual(self.cmp.dist_abs('', ''), 0)
        self.assertEqual(self.cmp.dist_abs('a', ''), 0)
        self.assertEqual(self.cmp.dist_abs('', 'a'), 0)
        self.assertEqual(self.cmp.dist_abs('abc', ''), 2)
        self.assertEqual(self.cmp.dist_abs('', 'abc'), 2)
        self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 0)
        self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 6)

        self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 6)
        self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 6)
        self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 6)
        self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 6)
        self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 5)

        # Example from paper
        self.assertEqual(self.cmp.dist_abs('01000', '001111'), 5)

        # Coverage
        self.assertEqual(self.cmp_q1.dist_abs('01000', '001111'), 5)
        self.assertEqual(self.cmp_ws.dist_abs('a a b b c', 'a b b b c d'), 3)
class ChebyshevTestCases(unittest.TestCase):
    """Test Chebyshev functions.

    abydos.distance.Chebyshev
    """

    cmp = Chebyshev()
    cmp_q2 = Chebyshev(tokenizer=QGrams(2))
    cmp_ws = Chebyshev(tokenizer=WhitespaceTokenizer())

    def test_chebyshev_dist_abs(self):
        """Test abydos.distance.Chebyshev.dist_abs."""
        self.assertEqual(self.cmp.dist_abs('', ''), 0)
        self.assertEqual(self.cmp.dist_abs('nelson', ''), 1)
        self.assertEqual(self.cmp.dist_abs('', 'neilsen'), 1)
        self.assertEqual(self.cmp.dist_abs('nelson', 'neilsen'), 1)

        self.assertEqual(self.cmp_q2.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 1)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 1)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 1)

    def test_chebyshev_dist(self):
        """Test abydos.distance.Chebyshev.dist."""
        self.assertRaises(NotImplementedError, self.cmp.dist)

    def test_chebyshev_sim(self):
        """Test abydos.distance.Chebyshev.sim."""
        self.assertRaises(NotImplementedError, self.cmp.sim)
示例#5
0
class MinkowskiTestCases(unittest.TestCase):
    """Test Minkowski functions.

    abydos.distance.Minkowski
    """

    cmp = Minkowski()
    cmp_q2 = Minkowski(tokenizer=QGrams(2))
    cmp_q1p0 = Minkowski(pval=0, tokenizer=QGrams(1))
    cmp_ws = Minkowski(tokenizer=WhitespaceTokenizer())

    def test_minkowski_dist_abs(self):
        """Test abydos.distance.Minkowski.dist_abs."""
        self.assertEqual(self.cmp.dist_abs('', ''), 0)
        self.assertEqual(self.cmp.dist_abs('nelson', ''), 7)
        self.assertEqual(self.cmp.dist_abs('', 'neilsen'), 8)
        self.assertEqual(self.cmp.dist_abs('nelson', 'neilsen'), 7)

        self.assertEqual(self.cmp_q2.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 7)
        self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 8)
        self.assertEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 7)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            7,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            8,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 2)
        self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 2)
        self.assertEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 8)
        self.assertEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 8)

        # test l_0 "norm"
        self.assertEqual(self.cmp_q1p0.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_q1p0.dist_abs('a', ''), 1)
        self.assertEqual(self.cmp_q1p0.dist_abs('a', 'b'), 2)
        self.assertEqual(self.cmp_q1p0.dist_abs('ab', 'b'), 1)
        self.assertEqual(self.cmp_q1p0.dist_abs('aab', 'b'), 1)
        self.assertEqual(self.cmp_q1p0.dist_abs('', '', normalized=True), 0)
        self.assertEqual(self.cmp_q1p0.dist_abs('a', '', normalized=True), 1)
        self.assertEqual(self.cmp_q1p0.dist_abs('a', 'b', normalized=True), 1)
        self.assertEqual(self.cmp_q1p0.dist_abs('ab', 'b', normalized=True),
                         1 / 2)
        self.assertEqual(self.cmp_q1p0.dist_abs('aab', 'b', normalized=True),
                         1 / 2)
        self.assertEqual(self.cmp_q1p0.dist_abs('aaab', 'b', normalized=True),
                         1 / 2)
        self.assertEqual(self.cmp_q1p0.dist_abs('aaab', 'ab', normalized=True),
                         1 / 2)

        # test with alphabet
        self.assertEqual(
            Minkowski(tokenizer=QGrams(1), alphabet=26).dist_abs('ab', 'b'), 1)
        self.assertEqual(
            Minkowski(tokenizer=QGrams(1),
                      alphabet=26).dist_abs('ab', 'b', normalized=True),
            1 / 26,
        )
        self.assertEqual(
            Minkowski(tokenizer=QGrams(1),
                      alphabet='abcdefghijklmnopqrstuvwxyz').dist_abs(
                          'ab', 'b', normalized=True),
            1 / 26,
        )

        self.assertEqual(
            Minkowski(pval=float('inf')).dist_abs('nelsonian', 'neilsen'), 1.0)

        # Test wrapper
        self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7)

    def test_minkowski_sim(self):
        """Test abydos.distance.Minkowski.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 8 / 15)

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            8 / 15,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(sim_minkowski('nelson', 'neilsen'), 8 / 15)

    def test_minkowski_dist(self):
        """Test abydos.distance.Minkowski.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 15)

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7 / 15,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(dist_minkowski('nelson', 'neilsen'), 7 / 15)
示例#6
0
class EuclideanTestCases(unittest.TestCase):
    """Test Euclidean functions.

    abydos.distance.Euclidean
    """

    cmp = Euclidean()
    cmp_q2 = Euclidean(tokenizer=QGrams(2))
    cmp_ws = Euclidean(tokenizer=WhitespaceTokenizer())

    def test_euclidean_dist_abs(self):
        """Test abydos.distance.Euclidean.dist_abs."""
        self.assertEqual(self.cmp.dist_abs('', ''), 0)
        self.assertEqual(self.cmp.dist_abs('nelson', ''), 7**0.5)
        self.assertEqual(self.cmp.dist_abs('', 'neilsen'), 8**0.5)
        self.assertAlmostEqual(self.cmp.dist_abs('nelson', 'neilsen'), 7**0.5)

        self.assertEqual(self.cmp_q2.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 7**0.5)
        self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 8**0.5)
        self.assertAlmostEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'),
                               7**0.5)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            7**0.5,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            8**0.5,
        )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7**0.5,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 2**0.5)
        self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 2**0.5)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO),
                               8**0.5)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM),
                               8**0.5)

        # Test wrapper
        self.assertAlmostEqual(euclidean('nelson', 'neilsen'), 7**0.5)

    def test_euclidean_sim(self):
        """Test abydos.distance.Euclidean.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'),
                               1 - 7**0.5 / 23**0.5)

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'),
                               1 - 7**0.5 / 23**0.5)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1 - 7**0.5 / 23**0.5,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO),
                               1 - 8**0.5 / 24**0.5)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM),
                               1 - 8**0.5 / 24**0.5)

        # Test wrapper
        self.assertAlmostEqual(sim_euclidean('nelson', 'neilsen'),
                               1 - 7**0.5 / 23**0.5)

    def test_euclidean_dist(self):
        """Test abydos.distance.Euclidean.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'),
                               7**0.5 / 23**0.5)

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'),
                               7**0.5 / 23**0.5)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7**0.5 / 23**0.5,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO),
                               8**0.5 / 24**0.5)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM),
                               8**0.5 / 24**0.5)

        # Test wrapper
        self.assertAlmostEqual(dist_euclidean('nelson', 'neilsen'),
                               7**0.5 / 23**0.5)
示例#7
0
class JaccardTestCases(unittest.TestCase):
    """Test Jaccard functions.

    abydos.distance.Jaccard
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def test_jaccard_sim(self):
        """Test abydos.distance.Jaccard.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 4 / 11)

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 4 / 11)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            4 / 11,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 3)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 3)

        # Test wrapper
        self.assertAlmostEqual(sim_jaccard('nelson', 'neilsen'), 4 / 11)

    def test_jaccard_dist(self):
        """Test abydos.distance.Jaccard.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 11)

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 11)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7 / 11,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 2 / 3)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 2 / 3)

        # Test wrapper
        self.assertAlmostEqual(dist_jaccard('nelson', 'neilsen'), 7 / 11)
示例#8
0
class TanimotoTestCases(unittest.TestCase):
    """Test Tanimoto functions.

    abydos.distance.Jaccard.tanimoto_coeff
    """

    cmp = Jaccard()
    cmp_q2 = Jaccard(tokenizer=QGrams(2))
    cmp_ws = Jaccard(tokenizer=WhitespaceTokenizer())

    def test_jaccard_tanimoto_coeff(self):
        """Test abydos.distance.Jaccard.tanimoto_coeff."""
        self.assertEqual(self.cmp.tanimoto_coeff('', ''), 0)
        self.assertEqual(self.cmp.tanimoto_coeff('nelson', ''), float('-inf'))
        self.assertEqual(self.cmp.tanimoto_coeff('', 'neilsen'), float('-inf'))
        self.assertAlmostEqual(self.cmp.tanimoto_coeff('nelson', 'neilsen'),
                               log2(4 / 11))

        self.assertEqual(self.cmp_q2.tanimoto_coeff('', ''), 0)
        self.assertEqual(self.cmp_q2.tanimoto_coeff('nelson', ''),
                         float('-inf'))
        self.assertEqual(self.cmp_q2.tanimoto_coeff('', 'neilsen'),
                         float('-inf'))
        self.assertAlmostEqual(
            self.cmp_q2.tanimoto_coeff('nelson', 'neilsen'),
            log2(4 / 11),
        )

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.tanimoto_coeff(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.tanimoto_coeff(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            float('-inf'),
        )
        self.assertEqual(
            self.cmp.tanimoto_coeff(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            float('-inf'),
        )
        self.assertAlmostEqual(
            self.cmp.tanimoto_coeff(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            log2(4 / 11),
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.tanimoto_coeff('', ''), 0)
        self.assertEqual(self.cmp_ws.tanimoto_coeff('the quick', ''),
                         float('-inf'))
        self.assertEqual(self.cmp_ws.tanimoto_coeff('', 'the quick'),
                         float('-inf'))
        self.assertAlmostEqual(self.cmp_ws.tanimoto_coeff(NONQ_FROM, NONQ_TO),
                               log2(1 / 3))
        self.assertAlmostEqual(self.cmp_ws.tanimoto_coeff(NONQ_TO, NONQ_FROM),
                               log2(1 / 3))

        # Test wrapper
        self.assertAlmostEqual(tanimoto('nelson', 'neilsen'), log2(4 / 11))
示例#9
0
class DiceTestCases(unittest.TestCase):
    """Test Dice functions.

    abydos.distance.Dice
    """

    cmp = Dice()
    cmp_q2 = Dice(tokenizer=QGrams(2))
    cmp_ws = Dice(tokenizer=WhitespaceTokenizer())

    def test_dice_sim(self):
        """Test abydos.distance.Dice.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 8 / 15)

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            8 / 15,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2)

    def test_dice_dist(self):
        """Test abydos.distance.Dice.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 15)

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7 / 15,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2)
示例#10
0
    def test_token_distance(self):
        """Test abydos.distance._TokenDistance members."""
        self.assertAlmostEqual(
            Jaccard(intersection_type='soft', alphabet=24).sim(
                'ATCAACGAGT', 'AACGATTAG'
            ),
            0.68,
        )
        self.assertAlmostEqual(
            Jaccard(qval=1, alphabet='CGAT').sim('ATCAACGAGT', 'AACGATTAG'),
            0.9,
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT').sim(
                'ATCAACGAGT', 'AACGATTAG'
            ),
            0.6372795969773299,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet=None).sim('synonym', 'antonym'),
            0.3333333333333333,
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=QSkipgrams(qval=3)).sim('synonym', 'antonym'),
            0.34146341463414637,
        )

        src_ctr = Counter({'a': 5, 'b': 2, 'c': 10})
        tar_ctr = Counter({'a': 2, 'c': 1, 'd': 3, 'e': 12})
        self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

        self.assertAlmostEqual(
            SokalMichener(normalizer='proportional').sim('synonym', 'antonym'),
            0.984777917351113,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='log').sim('synonym', 'antonym'),
            1.2385752469545532,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='exp', alphabet=0).sim(
                'synonym', 'antonym'
            ),
            3.221246147982545e18,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='laplace').sim('synonym', 'antonym'),
            0.98856416772554,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='inverse').sim('synonym', 'antonym'),
            197.95790155440417,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='complement').sim('synonym', 'antonym'),
            1.0204081632653061,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='base case').sim('synonym', 'antonym'),
            0.9897959183673469,
        )
        self.assertAlmostEqual(
            SokalMichener().sim('synonym', 'antonym'), 0.9897959183673469
        )

        sm = SokalMichener()
        sm._tokenize('synonym', 'antonym')  # noqa: SF01

        self.assertEqual(
            sm._get_tokens(),  # noqa: SF01
            (
                Counter(
                    {
                        '$s': 1,
                        'sy': 1,
                        'yn': 1,
                        'no': 1,
                        'on': 1,
                        'ny': 1,
                        'ym': 1,
                        'm#': 1,
                    }
                ),
                Counter(
                    {
                        '$a': 1,
                        'an': 1,
                        'nt': 1,
                        'to': 1,
                        'on': 1,
                        'ny': 1,
                        'ym': 1,
                        'm#': 1,
                    }
                ),
            ),
        )
        self.assertEqual(sm._src_card(), 8)  # noqa: SF01
        self.assertEqual(sm._tar_card(), 8)  # noqa: SF01
        self.assertEqual(
            sm._symmetric_difference(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    '$a': 1,
                    'an': 1,
                    'nt': 1,
                    'to': 1,
                }
            ),
        )
        self.assertEqual(sm._symmetric_difference_card(), 8)  # noqa: SF01
        self.assertEqual(sm._total_complement_card(), 772)  # noqa: SF01
        self.assertEqual(sm._population_card(), 788)  # noqa: SF01
        self.assertEqual(
            sm._union(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    'on': 1,
                    'ny': 1,
                    'ym': 1,
                    'm#': 1,
                    '$a': 1,
                    'an': 1,
                    'nt': 1,
                    'to': 1,
                }
            ),
        )
        self.assertEqual(sm._union_card(), 12)  # noqa: SF01
        self.assertEqual(
            sm._difference(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    'on': 0,
                    'ny': 0,
                    'ym': 0,
                    'm#': 0,
                    '$a': -1,
                    'an': -1,
                    'nt': -1,
                    'to': -1,
                }
            ),
        )
        self.assertEqual(
            sm._intersection(),  # noqa: SF01
            Counter({'on': 1, 'ny': 1, 'ym': 1, 'm#': 1}),
        )
        self.assertEqual(
            sm._get_confusion_table(),  # noqa: SF01
            ConfusionTable(tp=4, tn=772, fp=4, fn=4),
        )

        sm = SokalMichener(
            alphabet=Counter({'C': 20, 'G': 20, 'A': 20, 'T': 20}), qval=1
        )
        sm._tokenize('ATCAACGAGT', 'AACGATTAG')  # noqa: SF01
        self.assertEqual(sm._total_complement_card(), 61)  # noqa: SF01

        jac = Jaccard(
            intersection_type='linkage', internal_assignment_problem=True
        )
        self.assertAlmostEqual(jac.sim('abandonned', 'abandoned'), 1.0)
        self.assertAlmostEqual(
            jac.sim('abundacies', 'abundances'), 0.6296296296296297
        )

        # Some additional constructors needed to complete test coverage
        self.assertAlmostEqual(
            Jaccard(alphabet=None, qval=range(2, 4)).sim('abc', 'abcd'),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            AverageLinkage(qval=range(2, 4)).sim('abc', 'abcd'),
            0.22558922558922556,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet='abcdefghijklmnop', qval=range(2, 4)).sim(
                'abc', 'abcd'
            ),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            Jaccard(
                alphabet='abcdefghijklmnop', tokenizer=WhitespaceTokenizer()
            ).sim('abc', 'abcd'),
            0.0,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet=list('abcdefghijklmnop')).sim('abc', 'abcd'), 0.5
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=CharacterTokenizer()).sim('abc', 'abcd'), 0.75
        )
示例#11
0
class CosineSimilarityTestCases(unittest.TestCase):
    """Test cosine similarity functions.

    abydos.distance.Cosine
    """

    cmp = Cosine()
    cmp_q2 = Cosine(tokenizer=QGrams(2))
    cmp_ws = Cosine(tokenizer=WhitespaceTokenizer())

    def test_cosine_sim(self):
        """Test abydos.distance.Cosine.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'),
                               4 / math.sqrt(7 * 8))

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'),
                               4 / math.sqrt(7 * 8))

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            4 / math.sqrt(7 * 8),
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO),
                               4 / math.sqrt(9 * 7))
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM),
                               4 / math.sqrt(9 * 7))

        self.assertEqual(self.cmp_q2.sim('eh', 'a'), 0.0)

        # Test wrapper
        self.assertAlmostEqual(sim_cosine('nelson', 'neilsen'),
                               4 / math.sqrt(7 * 8))

    def test_cosine_dist(self):
        """Test abydos.distance.Cosine.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'),
                               1 - (4 / math.sqrt(7 * 8)))

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'),
                               1 - (4 / math.sqrt(7 * 8)))

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1 - (4 / math.sqrt(7 * 8)),
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO),
                               1 - 4 / math.sqrt(9 * 7))
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM),
                               1 - 4 / math.sqrt(9 * 7))

        # Test wrapper
        self.assertAlmostEqual(dist_cosine('nelson', 'neilsen'),
                               1 - (4 / math.sqrt(7 * 8)))
示例#12
0
class ManhattanTestCases(unittest.TestCase):
    """Test Manhattan functions.

    abydos.distance.Manhattan
    """

    cmp = Manhattan()
    cmp_q2 = Manhattan(tokenizer=QGrams(2))
    cmp_ws = Manhattan(tokenizer=WhitespaceTokenizer())

    def test_manhattan_dist_abs(self):
        """Test abydos.distance.Manhattan.dist_abs."""
        self.assertEqual(self.cmp.dist_abs('', ''), 0)
        self.assertEqual(self.cmp.dist_abs('nelson', ''), 7)
        self.assertEqual(self.cmp.dist_abs('', 'neilsen'), 8)
        self.assertAlmostEqual(self.cmp.dist_abs('nelson', 'neilsen'), 7)

        self.assertEqual(self.cmp_q2.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_q2.dist_abs('nelson', ''), 7)
        self.assertEqual(self.cmp_q2.dist_abs('', 'neilsen'), 8)
        self.assertAlmostEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 7)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            7,
        )
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            8,
        )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 2)
        self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 2)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 8)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 8)

        # Test wrapper
        self.assertAlmostEqual(manhattan('nelson', 'neilsen'), 7)

    def test_manhattan_sim(self):
        """Test abydos.distance.Manhattan.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 8 / 15)

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            8 / 15,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(sim_manhattan('nelson', 'neilsen'), 8 / 15)

    def test_manhattan_dist(self):
        """Test abydos.distance.Manhattan.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 15)

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7 / 15,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(dist_manhattan('nelson', 'neilsen'), 7 / 15)
示例#13
0
class OverlapTestCases(unittest.TestCase):
    """Test overlap functions.

    abydos.distance.Overlap
    """

    cmp = Overlap()
    cmp_q2 = Overlap(tokenizer=QGrams(2))
    cmp_ws = Overlap(tokenizer=WhitespaceTokenizer())

    def test_overlap_sim(self):
        """Test abydos.distance.Overlap.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 4 / 7)

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 4 / 7)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            4 / 7,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 4 / 7)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 4 / 7)

    def test_overlap_dist(self):
        """Test abydos.distance.Overlap.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 3 / 7)

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 3 / 7)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            3 / 7,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 3 / 7)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 3 / 7)
示例#14
0
from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

algorithms = {
    'corvcluster': COrVClusterTokenizer().tokenize,
    'cvcluster': CVClusterTokenizer().tokenize,
    'character': CharacterTokenizer().tokenize,
    'legalipy': LegaliPyTokenizer().tokenize,
    'nltk': NLTKTokenizer(nltk_tokenizer=TweetTokenizer()).tokenize,
    'qgrams': QGrams().tokenize,
    'qskipgrams': QSkipgrams().tokenize,
    'regexp': RegexpTokenizer().tokenize,
    'saps': SAPSTokenizer().tokenize,
    'sonoripy': SonoriPyTokenizer().tokenize,
    'vccluster': VCClusterTokenizer().tokenize,
    'whitespace': WhitespaceTokenizer().tokenize,
    'wordpunct': WordpunctTokenizer().tokenize,
}


class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each tokenizer against the BLNS set.

    Here, we test each algorithm against each string, but we only care that it
    does not result in an exception.

    While not actually a fuzz test, this does serve the purpose of looking for
    errors resulting from unanticipated input.
    """
    def fuzz_test_blns(self):
        """Test each tokenizer against the BLNS set."""
示例#15
0
class TverskyIndexTestCases(unittest.TestCase):
    """Test Tversky functions.

    abydos.distance.Tversky
    """

    cmp = Tversky()
    cmp_q2 = Tversky(tokenizer=QGrams(2))
    cmp_ws = Tversky(tokenizer=WhitespaceTokenizer())

    def test_tversky_sim(self):
        """Test abydos.distance.Tversky.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 4 / 11)

        self.assertEqual(self.cmp_q2.sim('', ''), 1)
        self.assertEqual(self.cmp_q2.sim('nelson', ''), 0)
        self.assertEqual(self.cmp_q2.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(self.cmp_q2.sim('nelson', 'neilsen'), 4 / 11)

        # test valid alpha & beta
        self.assertRaises(ValueError,
                          Tversky(alpha=-1.0, beta=-1.0).sim, 'abcd', 'dcba')
        self.assertRaises(ValueError,
                          Tversky(alpha=-1.0, beta=0.0).sim, 'abcd', 'dcba')
        self.assertRaises(ValueError,
                          Tversky(alpha=0.0, beta=-1.0).sim, 'abcd', 'dcba')

        # test empty QGrams
        self.assertAlmostEqual(
            Tversky(tokenizer=QGrams(7, start_stop='')).sim(
                'nelson', 'neilsen'),
            0.0,
        )

        # test unequal alpha & beta
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=1.0,
                    tokenizer=QGrams(2)).sim('niall', 'neal'),
            3 / 11,
        )
        self.assertAlmostEqual(
            Tversky(alpha=1.0, beta=2.0,
                    tokenizer=QGrams(2)).sim('niall', 'neal'),
            3 / 10,
        )
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=2.0,
                    tokenizer=QGrams(2)).sim('niall', 'neal'),
            3 / 13,
        )

        # test bias parameter
        self.assertAlmostEqual(
            Tversky(alpha=1.0, beta=1.0, bias=0.5,
                    tokenizer=QGrams(2)).sim('niall', 'neal'),
            7 / 11,
        )
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=1.0, bias=0.5,
                    tokenizer=QGrams(2)).sim('niall', 'neal'),
            7 / 9,
        )
        self.assertAlmostEqual(
            Tversky(alpha=1.0, beta=2.0, bias=0.5,
                    tokenizer=QGrams(2)).sim('niall', 'neal'),
            7 / 15,
        )
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=2.0, bias=0.5,
                    tokenizer=QGrams(2)).sim('niall', 'neal'),
            7 / 11,
        )

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.sim(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            0,
        )
        self.assertAlmostEqual(
            self.cmp.sim(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            4 / 11,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 3)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 3)

        # Test wrapper
        self.assertAlmostEqual(sim_tversky('nelson', 'neilsen'), 4 / 11)

    def test_tversky_dist(self):
        """Test abydos.distance.Tversky.dist."""
        self.assertEqual(self.cmp.dist('', ''), 0)
        self.assertEqual(self.cmp.dist('nelson', ''), 1)
        self.assertEqual(self.cmp.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp.dist('nelson', 'neilsen'), 7 / 11)

        self.assertEqual(self.cmp_q2.dist('', ''), 0)
        self.assertEqual(self.cmp_q2.dist('nelson', ''), 1)
        self.assertEqual(self.cmp_q2.dist('', 'neilsen'), 1)
        self.assertAlmostEqual(self.cmp_q2.dist('nelson', 'neilsen'), 7 / 11)

        # test valid alpha & beta
        self.assertRaises(ValueError,
                          Tversky(alpha=-1.0, beta=-1.0).dist, 'abcd', 'dcba')
        self.assertRaises(ValueError,
                          Tversky(alpha=-1.0, beta=0.0).dist, 'abcd', 'dcba')
        self.assertRaises(ValueError,
                          Tversky(alpha=0.0, beta=-1.0).dist, 'abcd', 'dcba')

        # test empty QGrams
        self.assertAlmostEqual(
            Tversky(tokenizer=QGrams(7, start_stop='')).dist(
                'nelson', 'neilsen'),
            1.0,
        )

        # test unequal alpha & beta
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=1.0,
                    tokenizer=QGrams(2)).dist('niall', 'neal'),
            8 / 11,
        )
        self.assertAlmostEqual(
            Tversky(alpha=1.0, beta=2.0,
                    tokenizer=QGrams(2)).dist('niall', 'neal'),
            7 / 10,
        )
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=2.0,
                    tokenizer=QGrams(2)).dist('niall', 'neal'),
            10 / 13,
        )

        # test bias parameter
        self.assertAlmostEqual(
            Tversky(alpha=1.0, beta=1.0, bias=0.5,
                    tokenizer=QGrams(2)).dist('niall', 'neal'),
            4 / 11,
        )
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=1.0, bias=0.5,
                    tokenizer=QGrams(2)).dist('niall', 'neal'),
            2 / 9,
        )
        self.assertAlmostEqual(
            Tversky(alpha=1.0, beta=2.0, bias=0.5,
                    tokenizer=QGrams(2)).dist('niall', 'neal'),
            8 / 15,
        )
        self.assertAlmostEqual(
            Tversky(alpha=2.0, beta=2.0, bias=0.5,
                    tokenizer=QGrams(2)).dist('niall', 'neal'),
            4 / 11,
        )

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            1,
        )
        self.assertEqual(
            self.cmp.dist(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            1,
        )
        self.assertAlmostEqual(
            self.cmp.dist(
                QGrams().tokenize('nelson').get_counter(),
                QGrams().tokenize('neilsen').get_counter(),
            ),
            7 / 11,
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 2 / 3)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 2 / 3)

        # Test wrapper
        self.assertAlmostEqual(dist_tversky('nelson', 'neilsen'), 7 / 11)