def test__tokenizer(self):
    """Test abydos.tokenizer._Tokenizer.

    Checks that ``_Tokenizer`` returns its whole input as a single token,
    then exercises the shared container API (set/list views, ``repr``,
    the ``&``/``+``/``-`` operators and the scalers) through ``QGrams``
    and ``QSkipgrams``.
    """
    # Each input is expected back as exactly one token.
    for text in ('', 'a', 'NELSON', 'NEILSEN'):
        self.assertEqual(
            _Tokenizer().tokenize(text).get_counter(), Counter({text: 1})
        )
    self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1)
    self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1)

    tweet = 'Good to be home for a night'
    self.assertEqual(
        _Tokenizer().tokenize(tweet).get_counter(), Counter({tweet: 1})
    )

    nelson = QGrams().tokenize('NELSON')
    neilsen = QGrams().tokenize('NEILSEN')

    # Bigrams of 'NELSON' in the order get_list() is expected to yield them.
    nelson_grams = ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
    self.assertEqual(nelson.get_set(), set(nelson_grams))
    self.assertEqual(nelson.get_list(), nelson_grams)

    if sys.version_info >= (3, 6):
        self.assertEqual(
            repr(nelson),
            "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, 'ON': 1, "
            "'N#': 1})",
        )

    # Operators on tokenized objects compare equal to plain Counters.
    self.assertEqual(
        nelson & neilsen,
        Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1}),
    )
    self.assertEqual(
        nelson + neilsen,
        Counter({
            '$N': 2, 'NE': 2, 'EL': 1, 'LS': 2, 'SO': 1, 'ON': 1,
            'N#': 2, 'EI': 1, 'IL': 1, 'SE': 1, 'EN': 1,
        }),
    )
    self.assertEqual(
        nelson - neilsen,
        Counter({'EL': 1, 'SO': 1, 'ON': 1}),
    )

    nelsonnelson = QGrams(scaler='set').tokenize('NELSONNELSON')
    self.assertEqual(nelsonnelson.count(), 8)

    nelson_ssk = QSkipgrams(scaler='SSK').tokenize('NELSON')
    self.assertAlmostEqual(nelson_ssk.count(), 18.66784401)

    nelson_log = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON')
    # The four weights that occur in the gold standard; numerically they
    # are approximately ln 2, ln 3, ln 5 and ln 9.
    ln2 = 0.6931471805599453
    ln3 = 1.0986122886681096
    ln5 = 1.6094379124341003
    ln9 = 2.1972245773362196
    gold_standard = Counter({
        '$$N': ln3, '$$E': ln2, '$$L': ln2,
        '$$S': ln2, '$$O': ln2, '$$#': ln3,
        '$NE': ln3, '$NL': ln3, '$NS': ln3,
        '$NO': ln3, '$NN': ln3, '$N#': ln9,
        '$EL': ln3, '$ES': ln3, '$EO': ln3,
        '$EN': ln3, '$E#': ln5, '$LS': ln3,
        '$LO': ln3, '$LN': ln3, '$L#': ln5,
        '$SO': ln3, '$SN': ln3, '$S#': ln5,
        '$ON': ln3, '$O#': ln5, '$##': ln3,
        'NEL': ln2, 'NES': ln2, 'NEO': ln2,
        'NEN': ln2, 'NE#': ln3, 'NLS': ln2,
        'NLO': ln2, 'NLN': ln2, 'NL#': ln3,
        'NSO': ln2, 'NSN': ln2, 'NS#': ln3,
        'NON': ln2, 'NO#': ln3, 'NN#': ln3,
        'N##': ln3, 'ELS': ln2, 'ELO': ln2,
        'ELN': ln2, 'EL#': ln3, 'ESO': ln2,
        'ESN': ln2, 'ES#': ln3, 'EON': ln2,
        'EO#': ln3, 'EN#': ln3, 'E##': ln2,
        'LSO': ln2, 'LSN': ln2, 'LS#': ln3,
        'LON': ln2, 'LO#': ln3, 'LN#': ln3,
        'L##': ln2, 'SON': ln2, 'SO#': ln3,
        'SN#': ln3, 'S##': ln2, 'ON#': ln3,
        'O##': ln2,
    })
    # Only the skipgrams actually produced are compared with the gold
    # standard.
    for gram, weight in nelson_log.get_counter().items():
        self.assertAlmostEqual(weight, gold_standard[gram])

    nelson_entropy = QSkipgrams(scaler='entropy').tokenize('NELSON')
    self.assertAlmostEqual(nelson_entropy.count(), 4.6644977792)
def test_tversky_sim(self):
    """Test abydos.distance.Tversky.sim.

    Covers the fixture measures ``self.cmp``, ``self.cmp_q2`` and
    ``self.cmp_ws``, the ``ValueError`` raised for negative ``alpha`` or
    ``beta``, unequal weights, the ``bias`` parameter, pre-tokenized
    (Counter) input, and the ``sim_tversky`` wrapper.
    """
    # Plain string input; both fixtures share the same expectations.
    for fixture in ('cmp', 'cmp_q2'):
        measure = getattr(self, fixture)
        self.assertEqual(measure.sim('', ''), 1)
        self.assertEqual(measure.sim('nelson', ''), 0)
        self.assertEqual(measure.sim('', 'neilsen'), 0)
        self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 4 / 11)

    # test valid alpha & beta
    # The measure is built outside the context manager so that only the
    # sim() call itself is expected to raise.
    for alpha, beta in ((-1.0, -1.0), (-1.0, 0.0), (0.0, -1.0)):
        invalid = Tversky(alpha=alpha, beta=beta)
        with self.assertRaises(ValueError):
            invalid.sim('abcd', 'dcba')

    # test empty QGrams
    no_grams = Tversky(tokenizer=QGrams(7, start_stop=''))
    self.assertAlmostEqual(no_grams.sim('nelson', 'neilsen'), 0.0)

    # test unequal alpha & beta
    for alpha, beta, expected in (
        (2.0, 1.0, 3 / 11),
        (1.0, 2.0, 3 / 10),
        (2.0, 2.0, 3 / 13),
    ):
        weighted = Tversky(alpha=alpha, beta=beta, tokenizer=QGrams(2))
        self.assertAlmostEqual(weighted.sim('niall', 'neal'), expected)

    # test bias parameter
    for alpha, beta, expected in (
        (1.0, 1.0, 7 / 11),
        (2.0, 1.0, 7 / 9),
        (1.0, 2.0, 7 / 15),
        (2.0, 2.0, 7 / 11),
    ):
        biased = Tversky(
            alpha=alpha, beta=beta, bias=0.5, tokenizer=QGrams(2)
        )
        self.assertAlmostEqual(biased.sim('niall', 'neal'), expected)

    # supplied q-gram tests
    def grams(word):
        return QGrams().tokenize(word).get_counter()

    self.assertEqual(self.cmp.sim(grams(''), grams('')), 1)
    self.assertEqual(self.cmp.sim(grams('nelson'), grams('')), 0)
    self.assertEqual(self.cmp.sim(grams(''), grams('neilsen')), 0)
    self.assertAlmostEqual(
        self.cmp.sim(grams('nelson'), grams('neilsen')), 4 / 11
    )

    # non-q-gram tests
    self.assertEqual(self.cmp_ws.sim('', ''), 1)
    self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
    self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
    self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 3)
    self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 3)

    # Test wrapper
    self.assertAlmostEqual(sim_tversky('nelson', 'neilsen'), 4 / 11)
VCClusterTokenizer, WhitespaceTokenizer, WordpunctTokenizer, ) from nltk import TweetTokenizer from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char algorithms = { 'corvcluster': COrVClusterTokenizer().tokenize, 'cvcluster': CVClusterTokenizer().tokenize, 'character': CharacterTokenizer().tokenize, 'legalipy': LegaliPyTokenizer().tokenize, 'nltk': NLTKTokenizer(nltk_tokenizer=TweetTokenizer()).tokenize, 'qgrams': QGrams().tokenize, 'qskipgrams': QSkipgrams().tokenize, 'regexp': RegexpTokenizer().tokenize, 'saps': SAPSTokenizer().tokenize, 'sonoripy': SonoriPyTokenizer().tokenize, 'vccluster': VCClusterTokenizer().tokenize, 'whitespace': WhitespaceTokenizer().tokenize, 'wordpunct': WordpunctTokenizer().tokenize, } class BigListOfNaughtyStringsTestCases(unittest.TestCase): """Test each tokenizer against the BLNS set. Here, we test each algorithm against each string, but we only care that it does not result in an exception.
class CosineSimilarityTestCases(unittest.TestCase):
    """Test cosine similarity functions.

    abydos.distance.Cosine
    """

    # Shared measures: default construction, an explicit QGrams(2)
    # tokenizer, and a whitespace tokenizer.
    cmp = Cosine()
    cmp_q2 = Cosine(tokenizer=QGrams(2))
    cmp_ws = Cosine(tokenizer=WhitespaceTokenizer())

    def test_cosine_sim(self):
        """Test abydos.distance.Cosine.sim."""
        name_sim = 4 / math.sqrt(7 * 8)
        phrase_sim = 4 / math.sqrt(9 * 7)

        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.sim('', ''), 1)
            self.assertEqual(measure.sim('nelson', ''), 0)
            self.assertEqual(measure.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), name_sim)

        # supplied q-gram tests
        def grams(word):
            return QGrams().tokenize(word).get_counter()

        self.assertEqual(self.cmp.sim(grams(''), grams('')), 1)
        self.assertEqual(self.cmp.sim(grams('nelson'), grams('')), 0)
        self.assertEqual(self.cmp.sim(grams(''), grams('neilsen')), 0)
        self.assertAlmostEqual(
            self.cmp.sim(grams('nelson'), grams('neilsen')), name_sim
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), phrase_sim)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), phrase_sim)

        self.assertEqual(self.cmp_q2.sim('eh', 'a'), 0.0)

        # Test wrapper
        self.assertAlmostEqual(sim_cosine('nelson', 'neilsen'), name_sim)

    def test_cosine_dist(self):
        """Test abydos.distance.Cosine.dist."""
        name_dist = 1 - (4 / math.sqrt(7 * 8))
        phrase_dist = 1 - 4 / math.sqrt(9 * 7)

        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.dist('', ''), 0)
            self.assertEqual(measure.dist('nelson', ''), 1)
            self.assertEqual(measure.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), name_dist
            )

        # supplied q-gram tests
        def grams(word):
            return QGrams().tokenize(word).get_counter()

        self.assertEqual(self.cmp.dist(grams(''), grams('')), 0)
        self.assertEqual(self.cmp.dist(grams('nelson'), grams('')), 1)
        self.assertEqual(self.cmp.dist(grams(''), grams('neilsen')), 1)
        self.assertAlmostEqual(
            self.cmp.dist(grams('nelson'), grams('neilsen')), name_dist
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(
            self.cmp_ws.dist(NONQ_FROM, NONQ_TO), phrase_dist
        )
        self.assertAlmostEqual(
            self.cmp_ws.dist(NONQ_TO, NONQ_FROM), phrase_dist
        )

        # Test wrapper
        self.assertAlmostEqual(dist_cosine('nelson', 'neilsen'), name_dist)
class FuzzyWuzzyTokenSetTestCases(unittest.TestCase):
    """Test FuzzyWuzzyTokenSet functions.

    abydos.distance.FuzzyWuzzyTokenSet
    """

    # Default measure and a variant using a QGrams(qval=2) tokenizer.
    cmp = FuzzyWuzzyTokenSet()
    cmp_q2 = FuzzyWuzzyTokenSet(tokenizer=QGrams(qval=2))

    def test_fuzzywuzzy_token_set_sim(self):
        """Test abydos.distance.FuzzyWuzzyTokenSet.sim."""
        # Base cases
        exact_cases = (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.3333333333333333),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.sim(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.6666666667),
            ('Niall', 'Nigel', 0.6666666667),
            ('Colin', 'Coiln', 0.8333333333),
            ('Coiln', 'Colin', 0.8333333333),
            ('ATCAACGAGT', 'AACGATTAG', 0.6666666667),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.sim(src, tar), expected)

        # tests from blog
        blog_sim = self.cmp.sim(
            'mariners vs angels',
            'los angeles angels of anaheim at seattle mariners',
        )
        self.assertEqual(blog_sim, 0.9411764705882353)
        self.assertEqual(self.cmp.sim('Sirhan, Sirhan', 'Sirhan'), 1.0)

        # q2 tokenizer
        q2_cases = (
            ('ATCAACGAGT', 'AACGATTAG', 0.84),
            ('YANKEES', 'NEW YORK YANKEES', 0.9545454545454546),
            ('NEW YORK METS', 'NEW YORK YANKEES', 0.8450704225352113),
            (
                'New York Mets vs Atlanta Braves',
                'Atlanta Braves vs New York Mets',
                0.9782608695652174,
            ),
        )
        for src, tar, expected in q2_cases:
            self.assertAlmostEqual(self.cmp_q2.sim(src, tar), expected)

    def test_fuzzywuzzy_token_set_dist(self):
        """Test abydos.distance.FuzzyWuzzyTokenSet.dist."""
        # Base cases
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 0.6666666666666667),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.3333333333),
            ('Niall', 'Nigel', 0.3333333333),
            ('Colin', 'Coiln', 0.1666666667),
            ('Coiln', 'Colin', 0.1666666667),
            ('ATCAACGAGT', 'AACGATTAG', 0.3333333333),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.dist(src, tar), expected)
class Sift4ExtendedTestCases(unittest.TestCase):
    """Test Sift4Extended functions.

    abydos.distance.Sift4Extended
    """

    # Short alias so the keyword argument below stays readable.
    ltamc = Sift4Extended.longer_transpositions_are_more_costly

    # Measures under test: defaults, every evaluator supplied explicitly,
    # an alternative local-length evaluator, and a max_distance limit.
    cmp = Sift4Extended()
    cmp_kwargs = Sift4Extended(
        tokenizer=QGrams(qval=2),
        token_matcher=Sift4Extended.sift4_token_matcher,
        matching_evaluator=Sift4Extended.sift4_matching_evaluator,
        local_length_evaluator=Sift4Extended.reward_length_evaluator,
        transposition_cost_evaluator=ltamc,
        transpositions_evaluator=lambda lcss, trans: lcss - trans,
    )
    cmp_kwargs2 = Sift4Extended(
        local_length_evaluator=Sift4Extended.reward_length_evaluator_exp
    )
    cmp_md = Sift4Extended(max_distance=3)

    def test_sift4_extended_dist_abs(self):
        """Test abydos.distance.Sift4Extended.dist_abs."""
        # Base cases
        base_pairs = (
            ('', ''),
            ('a', ''),
            ('', 'a'),
            ('abc', ''),
            ('', 'abc'),
            ('abc', 'abc'),
            ('abcd', 'efgh'),
        )
        name_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        )

        # Default measure.
        for (src, tar), expected in zip(base_pairs, (0, 1, 1, 3, 3, 0, 4)):
            self.assertEqual(self.cmp.dist_abs(src, tar), expected)
        for (src, tar), expected in zip(name_pairs, (2, 2, 1, 1, 4)):
            self.assertAlmostEqual(self.cmp.dist_abs(src, tar), expected)

        # Fully parameterized measure.
        for (src, tar), expected in zip(base_pairs, (0, 2, 2, 4, 4, -1, -2)):
            self.assertEqual(self.cmp_kwargs.dist_abs(src, tar), expected)
        for (src, tar), expected in zip(name_pairs, (1, 1, 1, 1, 2)):
            self.assertAlmostEqual(
                self.cmp_kwargs.dist_abs(src, tar), expected
            )

        # Alternative length evaluator; only the last two base pairs are
        # checked for this measure.
        for (src, tar), expected in zip(base_pairs[-2:], (0, 8)):
            self.assertEqual(self.cmp_kwargs2.dist_abs(src, tar), expected)
        for (src, tar), expected in zip(name_pairs, (7, 7, 6, 6, 25)):
            self.assertAlmostEqual(
                self.cmp_kwargs2.dist_abs(src, tar), expected
            )

        # coverage completion
        coverage_cases = (
            (self.cmp_kwargs, 'beaurocracy', 'bureaucracy'),
            (self.cmp_md, 'beaurocratically', 'bureaucracy'),
            (self.cmp_md, 'bureaucracy', 'bureaucracy'),
        )
        for measure, src, tar in coverage_cases:
            self.assertAlmostEqual(measure.dist_abs(src, tar), 3)
class MinkowskiTestCases(unittest.TestCase):
    """Test Minkowski functions.

    abydos.distance.Minkowski
    """

    # Shared measures: defaults, QGrams(2), pval=0 over QGrams(1), and a
    # whitespace tokenizer.
    cmp = Minkowski()
    cmp_q2 = Minkowski(tokenizer=QGrams(2))
    cmp_q1p0 = Minkowski(pval=0, tokenizer=QGrams(1))
    cmp_ws = Minkowski(tokenizer=WhitespaceTokenizer())

    def test_minkowski_dist_abs(self):
        """Test abydos.distance.Minkowski.dist_abs."""
        name_cases = (
            ('', '', 0),
            ('nelson', '', 7),
            ('', 'neilsen', 8),
            ('nelson', 'neilsen', 7),
        )
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in name_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)

        # supplied q-gram tests
        for src, tar, expected in name_cases:
            self.assertEqual(
                self.cmp.dist_abs(
                    QGrams().tokenize(src).get_counter(),
                    QGrams().tokenize(tar).get_counter(),
                ),
                expected,
            )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 2)
        self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 2)
        self.assertEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 8)
        self.assertEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 8)

        # test l_0 "norm"
        l0_cases = (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 2),
            ('ab', 'b', 1),
            ('aab', 'b', 1),
        )
        for src, tar, expected in l0_cases:
            self.assertEqual(self.cmp_q1p0.dist_abs(src, tar), expected)

        l0_normalized_cases = (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 1),
            ('ab', 'b', 1 / 2),
            ('aab', 'b', 1 / 2),
            ('aaab', 'b', 1 / 2),
            ('aaab', 'ab', 1 / 2),
        )
        for src, tar, expected in l0_normalized_cases:
            self.assertEqual(
                self.cmp_q1p0.dist_abs(src, tar, normalized=True), expected
            )

        # test with alphabet
        sized_alphabet = Minkowski(tokenizer=QGrams(1), alphabet=26)
        self.assertEqual(sized_alphabet.dist_abs('ab', 'b'), 1)

        sized_alphabet = Minkowski(tokenizer=QGrams(1), alphabet=26)
        self.assertEqual(
            sized_alphabet.dist_abs('ab', 'b', normalized=True), 1 / 26
        )

        spelled_alphabet = Minkowski(
            tokenizer=QGrams(1), alphabet='abcdefghijklmnopqrstuvwxyz'
        )
        self.assertEqual(
            spelled_alphabet.dist_abs('ab', 'b', normalized=True), 1 / 26
        )

        unbounded_p = Minkowski(pval=float('inf'))
        self.assertEqual(unbounded_p.dist_abs('nelsonian', 'neilsen'), 1.0)

        # Test wrapper
        self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7)

    def test_minkowski_sim(self):
        """Test abydos.distance.Minkowski.sim."""
        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.sim('', ''), 1)
            self.assertEqual(measure.sim('nelson', ''), 0)
            self.assertEqual(measure.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        def grams(word):
            return QGrams().tokenize(word).get_counter()

        self.assertEqual(self.cmp.sim(grams(''), grams('')), 1)
        self.assertEqual(self.cmp.sim(grams('nelson'), grams('')), 0)
        self.assertEqual(self.cmp.sim(grams(''), grams('neilsen')), 0)
        self.assertAlmostEqual(
            self.cmp.sim(grams('nelson'), grams('neilsen')), 8 / 15
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.sim(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(sim_minkowski('nelson', 'neilsen'), 8 / 15)

    def test_minkowski_dist(self):
        """Test abydos.distance.Minkowski.dist."""
        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.dist('', ''), 0)
            self.assertEqual(measure.dist('nelson', ''), 1)
            self.assertEqual(measure.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        def grams(word):
            return QGrams().tokenize(word).get_counter()

        self.assertEqual(self.cmp.dist(grams(''), grams('')), 0)
        self.assertEqual(self.cmp.dist(grams('nelson'), grams('')), 1)
        self.assertEqual(self.cmp.dist(grams(''), grams('neilsen')), 1)
        self.assertAlmostEqual(
            self.cmp.dist(grams('nelson'), grams('neilsen')), 7 / 15
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_FROM, NONQ_TO), 1 / 2)
        self.assertAlmostEqual(self.cmp_ws.dist(NONQ_TO, NONQ_FROM), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(dist_minkowski('nelson', 'neilsen'), 7 / 15)
def test_qgrams(self):
    """Test abydos.tokenizer.QGrams.

    Exercises tokenization for single and multiple ``qval`` values,
    multiple ``skip`` values, input with and without start/stop padding
    (``start_stop=''``), the counter/set/``repr`` views, and the
    ``ValueError`` raised by ``QGrams(0)``.
    """
    def grams(text, *args, **kwargs):
        """Return the sorted token list QGrams(*args, **kwargs) yields."""
        return sorted(QGrams(*args, **kwargs).tokenize(text).get_list())

    self.assertEqual(grams(''), [])
    self.assertEqual(grams('a', 2), ['$a', 'a#'])
    self.assertEqual(grams('NELSON', -1), [])
    self.assertEqual(
        grams('NELSON', 3),
        sorted(['$$N', '$NE', 'NEL', 'ELS', 'LSO', 'SON', 'ON#', 'N##']),
    )
    self.assertEqual(
        grams('NELSON', 7),
        sorted([
            '$$$$$$N', '$$$$$NE', '$$$$NEL', '$$$NELS',
            '$$NELSO', '$NELSON', 'ELSON##', 'LSON###',
            'N######', 'NELSON#', 'ON#####', 'SON####',
        ]),
    )

    # http://www.sound-ex.com/alternative_qgram.htm
    self.assertEqual(
        grams('NELSON'),
        sorted(['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']),
    )
    self.assertEqual(
        grams('NEILSEN'),
        sorted(['$N', 'NE', 'EI', 'IL', 'LS', 'SE', 'EN', 'N#']),
    )
    self.assertEqual(
        grams('NELSON', start_stop=''),
        sorted(['NE', 'EL', 'LS', 'SO', 'ON']),
    )
    self.assertEqual(
        grams('NEILSEN', start_stop=''),
        sorted(['NE', 'EI', 'IL', 'LS', 'SE', 'EN']),
    )

    # Multiple q values: ascending, descending and range() forms are all
    # expected to produce the same multiset of tokens.
    mixed_q = sorted([
        '$N', 'E', 'EL', 'L', 'LS', 'N', 'N', 'N#', 'NE', 'O', 'ON', 'S',
        'SO',
    ])
    for qval in ((1, 2), (2, 1), range(3)):
        self.assertEqual(grams('NELSON', qval=qval), mixed_q)
    self.assertEqual(QGrams(qval=(1, 2)).tokenize('NELSON').count(), 13)

    # Multiple skip values, mirroring the qval checks above. The ascending
    # tuple is included so that all three orderings are covered, not just
    # the descending tuple and range().
    mixed_skip = sorted([
        '$E', '$L', '$N', 'EL', 'EO', 'ES', 'LN', 'LO', 'LS', 'N', 'N',
        'N#', 'NE', 'NL', 'NS', 'O', 'O#', 'ON', 'S#', 'SN', 'SO',
    ])
    for skip in ((0, 1, 2), (2, 1, 0), range(3)):
        self.assertEqual(grams('NELSON', skip=skip), mixed_skip)
    self.assertEqual(QGrams(skip=(0, 1, 2)).tokenize('NELSON').count(), 21)

    self.assertEqual(
        QGrams(qval=1).tokenize('COLIN').get_counter(),
        Counter({'C': 1, 'O': 1, 'L': 1, 'I': 1, 'N': 1}),
    )
    # qval larger than the unpadded input yields no tokens at all.
    self.assertEqual(
        QGrams(qval=10, start_stop='').tokenize('COLIN').get_counter(),
        Counter({}),
    )
    if sys.version_info >= (3, 6):
        self.assertEqual(
            repr(QGrams(qval=1).tokenize('COLIN')),
            "QGrams({'C': 1, 'O': 1, 'L': 1, 'I': 1, 'N': 1})",
        )
    self.assertEqual(
        QGrams(qval=1).tokenize('COLIN').get_set(),
        {'C', 'O', 'L', 'I', 'N'},
    )

    # Test exception
    self.assertRaises(ValueError, QGrams, 0)
class ChebyshevTestCases(unittest.TestCase):
    """Test Chebyshev functions.

    abydos.distance.Chebyshev
    """

    # Shared measures: defaults, QGrams(2), and a whitespace tokenizer.
    cmp = Chebyshev()
    cmp_q2 = Chebyshev(tokenizer=QGrams(2))
    cmp_ws = Chebyshev(tokenizer=WhitespaceTokenizer())

    def test_chebyshev_dist_abs(self):
        """Test abydos.distance.Chebyshev.dist_abs."""
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        )

        for src, tar, expected in edge_cases:
            self.assertEqual(self.cmp.dist_abs(src, tar), expected)
        self.assertEqual(self.cmp.dist_abs('nelson', 'neilsen'), 1)

        for src, tar, expected in edge_cases:
            self.assertEqual(self.cmp_q2.dist_abs(src, tar), expected)
        self.assertAlmostEqual(self.cmp_q2.dist_abs('nelson', 'neilsen'), 1)

        # supplied q-gram tests
        def grams(word):
            return QGrams().tokenize(word).get_counter()

        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist_abs(grams(src), grams(tar)), expected
            )
        self.assertAlmostEqual(
            self.cmp.dist_abs(grams('nelson'), grams('neilsen')), 1
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist_abs('', ''), 0)
        self.assertEqual(self.cmp_ws.dist_abs('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist_abs('', 'the quick'), 1)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_FROM, NONQ_TO), 1)
        self.assertAlmostEqual(self.cmp_ws.dist_abs(NONQ_TO, NONQ_FROM), 1)

        # Test wrapper
        self.assertAlmostEqual(chebyshev('nelson', 'neilsen', 2), 1)

    def test_chebyshev_dist(self):
        """Test abydos.distance.Chebyshev.dist."""
        # Called with no arguments, as the test only expects the error.
        with self.assertRaises(NotImplementedError):
            self.cmp.dist()

    def test_chebyshev_sim(self):
        """Test abydos.distance.Chebyshev.sim."""
        # Called with no arguments, as the test only expects the error.
        with self.assertRaises(NotImplementedError):
            self.cmp.sim()
def test_qgrams_intersections(self):
    """Test abydos.tokenizer.QGrams intersections.

    Each case intersects (``&``) the tokenizations of two words, built
    with the same QGrams options, and compares the sorted result with
    the expected sorted list of shared q-grams.
    """
    def shared(src, tar, **options):
        left = QGrams(**options).tokenize(src)
        right = QGrams(**options).tokenize(tar)
        return sorted(left & right)

    # (source, target, QGrams keyword options, expected shared q-grams)
    cases = (
        ('NELSON', '', {}, []),
        ('', 'NEILSEN', {}, []),
        ('NELSON', 'NEILSEN', {}, ['$N', 'NE', 'LS', 'N#']),
        ('NELSON', 'NOSLEN', {}, ['$N', 'N#']),
        ('NAIL', 'LIAN', {}, []),
        ('NELSON', 'NEILSEN', {'start_stop': ''}, ['NE', 'LS']),
        ('NELSON', 'NOSLEN', {'start_stop': ''}, []),
        ('NAIL', 'LIAN', {'start_stop': ''}, []),
    )
    for src, tar, options, expected in cases:
        self.assertEqual(shared(src, tar, **options), sorted(expected))
def test_qgrams_counts(self):
    """Test abydos.tokenizer.QGrams counts.

    Checks ``count()`` against the length of ``get_list()`` for padded
    and unpadded input, the ``'set'`` and ``log1p`` scalers, and Counter
    arithmetic between two tokenizations.
    """
    self.assertEqual(QGrams().tokenize('').count(), 0)
    self.assertEqual(len(QGrams().tokenize('').get_list()), 0)

    # (QGrams keyword options, word, expected number of tokens)
    sizes = (
        ({}, 'NEILSEN', 8),
        ({}, 'NELSON', 7),
        ({'start_stop': ''}, 'NEILSEN', 6),
        ({'start_stop': ''}, 'NELSON', 5),
    )
    for options, word, expected in sizes:
        self.assertEqual(QGrams(**options).tokenize(word).count(), expected)
    for options, word, expected in sizes:
        self.assertEqual(
            len(QGrams(**options).tokenize(word).get_list()), expected
        )

    dna = 'ACAACACCTAG'
    dna_grams = ('$A', 'AC', 'CA', 'AA', 'CC', 'CT', 'TA', 'AG', 'G#')

    # With the 'set' scaler every distinct q-gram is counted once.
    self.assertEqual(
        QGrams(scaler='set').tokenize(dna).get_counter(),
        Counter(dict.fromkeys(dna_grams, 1)),
    )

    # Expected log1p-scaled weights: all but 'AC' and 'CA' share the
    # smallest value.
    gold_standard = Counter(dict.fromkeys(dna_grams, 0.6931471805599453))
    gold_standard['AC'] = 1.3862943611198906
    gold_standard['CA'] = 1.0986122886681096

    scaled = QGrams(scaler=log1p).tokenize(dna).get_counter()
    for gram, weight in scaled.items():
        self.assertAlmostEqual(weight, gold_standard[gram])

    self.assertEqual(QGrams(scaler=log1p).tokenize(dna).count_unique(), 9)

    tokens1 = QGrams().tokenize('ACAACACCTAG')
    tokens2 = QGrams().tokenize('GAAGATAC')
    self.assertEqual(
        tokens1 - tokens2,
        Counter({'$A': 1, 'AC': 2, 'CA': 2, 'CC': 1, 'CT': 1, 'G#': 1}),
    )
    self.assertEqual(
        tokens1 + tokens2,
        Counter({
            '$A': 1, 'AC': 4, 'CA': 2, 'AA': 2, 'CC': 1, 'CT': 1,
            'TA': 2, 'AG': 2, 'G#': 1, '$G': 1, 'GA': 2, 'AT': 1,
            'C#': 1,
        }),
    )
class SAPSTestCases(unittest.TestCase):
    """Test SAPS functions.

    abydos.distance.SAPS
    """

    # Default measure and a variant using a QGrams(2) tokenizer.
    cmp = SAPS()
    cmp_q2 = SAPS(tokenizer=QGrams(2))

    def test_saps_sim(self):
        """Test abydos.distance.SAPS.sim."""
        # Base cases
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.sim(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.0666666667),
            ('Niall', 'Nigel', 0.0666666667),
            ('Colin', 'Coiln', 0.0666666667),
            ('Coiln', 'Colin', 0.0666666667),
            ('ATCAACGAGT', 'AACGATTAG', 0.4333333333),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.sim(src, tar), expected)

        # Coverage
        self.assertAlmostEqual(
            self.cmp_q2.sim('Stevenson', 'Stinson'), 0.3857142857
        )

        # Examples from paper
        self.assertAlmostEqual(
            self.cmp.sim('Stevenson', 'Stinson'), 0.551724138
        )

    def test_saps_dist(self):
        """Test abydos.distance.SAPS.dist."""
        # Base cases
        exact_cases = (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.9333333333),
            ('Niall', 'Nigel', 0.9333333333),
            ('Colin', 'Coiln', 0.9333333333),
            ('Coiln', 'Colin', 0.9333333333),
            ('ATCAACGAGT', 'AACGATTAG', 0.5666666667),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.dist(src, tar), expected)

        # Coverage
        self.assertAlmostEqual(
            self.cmp_q2.dist('Stevenson', 'Stinson'), 0.614285714
        )

        # Examples from paper
        self.assertAlmostEqual(
            self.cmp.dist('Stevenson', 'Stinson'), 0.448275862
        )

    def test_saps_sim_score(self):
        """Test abydos.distance.SAPS.sim_score."""
        # Base cases
        exact_cases = (
            ('', '', 0),
            ('a', '', -3),
            ('', 'a', -3),
            ('abc', '', -7),
            ('', 'abc', -7),
            ('abc', 'abc', 13),
            ('abcd', 'efgh', -7),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.sim_score(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 1),
            ('Niall', 'Nigel', 1),
            ('Colin', 'Coiln', 1),
            ('Coiln', 'Colin', 1),
            ('ATCAACGAGT', 'AACGATTAG', 13),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.sim_score(src, tar), expected)

        # Coverage
        self.assertEqual(self.cmp_q2.sim_score('Stevenson', 'Stinson'), 27)

        # Examples from paper
        self.assertEqual(self.cmp.sim_score('Stevenson', 'Stinson'), 16)
def test__tokenizer(self):
    """Check abydos.tokenizer._Tokenizer and tokenizer set arithmetic."""
    # Each input is expected back as one single token with count 1.
    for text in ('', 'a', 'NELSON', 'NEILSEN'):
        self.assertEqual(
            _Tokenizer().tokenize(text).get_counter(), Counter({text: 1})
        )

    self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1)
    self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1)

    tweet = 'Good to be home for a night'
    self.assertEqual(
        _Tokenizer().tokenize(tweet).get_counter(),
        Counter({'Good to be home for a night': 1}),
    )

    nelson = QGrams().tokenize('NELSON')
    neilsen = QGrams().tokenize('NEILSEN')

    expected_set = {'$N', 'EL', 'LS', 'N#', 'NE', 'ON', 'SO'}
    self.assertEqual(nelson.get_set(), expected_set)

    expected_list = ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
    self.assertEqual(nelson.get_list(), expected_list)

    # NOTE(review): presumably gated on 3.6+ because the repr depends on
    # dict insertion order -- confirm.
    if sys.version_info >= (3, 6):
        expected_repr = (
            "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, 'ON': 1, "
            "'N#': 1})"
        )
        self.assertEqual(repr(nelson), expected_repr)

    # Intersection, sum and difference of two tokenized words
    shared = Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1})
    self.assertEqual(nelson & neilsen, shared)

    combined = Counter(
        {
            '$N': 2,
            'NE': 2,
            'EL': 1,
            'LS': 2,
            'SO': 1,
            'ON': 1,
            'N#': 2,
            'EI': 1,
            'IL': 1,
            'SE': 1,
            'EN': 1,
        }
    )
    self.assertEqual(nelson + neilsen, combined)

    only_nelson = Counter({'EL': 1, 'SO': 1, 'ON': 1})
    self.assertEqual(nelson - neilsen, only_nelson)

    # Scaler variants
    set_scaled = QGrams(scaler='set').tokenize('NELSONNELSON')
    self.assertEqual(set_scaled.count(), 8)

    ssk_scaled = QSkipgrams(scaler='SSK').tokenize('NELSON')
    self.assertAlmostEqual(ssk_scaled.count(), 18.66784401)

    log_scaled = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON')

    # The four distinct weights in the expected table; numerically these
    # equal log1p(1), log1p(2), log1p(4) and log1p(8).
    w1 = 0.6931471805599453
    w2 = 1.0986122886681096
    w4 = 1.6094379124341003
    w8 = 2.1972245773362196
    gold_standard = Counter(
        {
            '$$N': w2, '$$E': w1, '$$L': w1, '$$S': w1, '$$O': w1,
            '$$#': w2,
            '$NE': w2, '$NL': w2, '$NS': w2, '$NO': w2, '$NN': w2,
            '$N#': w8,
            '$EL': w2, '$ES': w2, '$EO': w2, '$EN': w2, '$E#': w4,
            '$LS': w2, '$LO': w2, '$LN': w2, '$L#': w4,
            '$SO': w2, '$SN': w2, '$S#': w4,
            '$ON': w2, '$O#': w4,
            '$##': w2,
            'NEL': w1, 'NES': w1, 'NEO': w1, 'NEN': w1, 'NE#': w2,
            'NLS': w1, 'NLO': w1, 'NLN': w1, 'NL#': w2,
            'NSO': w1, 'NSN': w1, 'NS#': w2,
            'NON': w1, 'NO#': w2,
            'NN#': w2,
            'N##': w2,
            'ELS': w1, 'ELO': w1, 'ELN': w1, 'EL#': w2,
            'ESO': w1, 'ESN': w1, 'ES#': w2,
            'EON': w1, 'EO#': w2,
            'EN#': w2,
            'E##': w1,
            'LSO': w1, 'LSN': w1, 'LS#': w2,
            'LON': w1, 'LO#': w2,
            'LN#': w2,
            'L##': w1,
            'SON': w1, 'SO#': w2,
            'SN#': w2,
            'S##': w1,
            'ON#': w2,
            'O##': w1,
        }
    )

    # Every token produced must carry the weight listed in the gold table.
    for token, weight in log_scaled.get_counter().items():
        self.assertAlmostEqual(weight, gold_standard[token])
def test_minkowski_dist_abs(self):
    """Check abydos.distance.Minkowski.dist_abs against known values."""
    word_cases = (
        ('', '', 0),
        ('nelson', '', 7),
        ('', 'neilsen', 8),
        ('nelson', 'neilsen', 7),
    )

    # Same expectations for the cmp and cmp_q2 instances
    for measure in (self.cmp, self.cmp_q2):
        for src, tar, expected in word_cases:
            self.assertEqual(measure.dist_abs(src, tar), expected)

    # supplied q-gram tests
    for src, tar, expected in word_cases:
        src_counter = QGrams().tokenize(src).get_counter()
        tar_counter = QGrams().tokenize(tar).get_counter()
        self.assertEqual(
            self.cmp.dist_abs(src_counter, tar_counter), expected
        )

    # non-q-gram tests
    whitespace_cases = (
        ('', '', 0),
        ('the quick', '', 2),
        ('', 'the quick', 2),
        (NONQ_FROM, NONQ_TO, 8),
        (NONQ_TO, NONQ_FROM, 8),
    )
    for src, tar, expected in whitespace_cases:
        self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)

    # test l_0 "norm"
    l0_dist = self.cmp_q1p0.dist_abs
    raw_cases = (
        ('', '', 0),
        ('a', '', 1),
        ('a', 'b', 2),
        ('ab', 'b', 1),
        ('aab', 'b', 1),
    )
    for src, tar, expected in raw_cases:
        self.assertEqual(l0_dist(src, tar), expected)

    normalized_cases = (
        ('', '', 0),
        ('a', '', 1),
        ('a', 'b', 1),
        ('ab', 'b', 1 / 2),
        ('aab', 'b', 1 / 2),
        ('aaab', 'b', 1 / 2),
        ('aaab', 'ab', 1 / 2),
    )
    for src, tar, expected in normalized_cases:
        self.assertEqual(l0_dist(src, tar, normalized=True), expected)

    # test with alphabet (a fresh Minkowski instance per check)
    by_size = Minkowski(tokenizer=QGrams(1), alphabet=26)
    self.assertEqual(by_size.dist_abs('ab', 'b'), 1)

    by_size_norm = Minkowski(tokenizer=QGrams(1), alphabet=26)
    self.assertEqual(
        by_size_norm.dist_abs('ab', 'b', normalized=True), 1 / 26
    )

    by_letters = Minkowski(
        tokenizer=QGrams(1), alphabet='abcdefghijklmnopqrstuvwxyz'
    )
    self.assertEqual(by_letters.dist_abs('ab', 'b', normalized=True), 1 / 26)

    infinite_p = Minkowski(pval=float('inf'))
    self.assertEqual(infinite_p.dist_abs('nelsonian', 'neilsen'), 1.0)

    # Test wrapper
    self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7)
class DiceTestCases(unittest.TestCase):
    """Test cases for the Dice measure.

    abydos.distance.Dice
    """

    cmp = Dice()
    cmp_q2 = Dice(tokenizer=QGrams(2))
    cmp_ws = Dice(tokenizer=WhitespaceTokenizer())

    def test_dice_sim(self):
        """Check abydos.distance.Dice.sim against known values."""
        exact_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(self.cmp.sim(src_counter, tar_counter), expected)
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.sim(nelson_counter, neilsen_counter), 8 / 15
        )

        # non-q-gram tests
        ws_sim = self.cmp_ws.sim
        for src, tar, expected in (
            ('', '', 1),
            ('the quick', '', 0),
            ('', 'the quick', 0),
        ):
            self.assertEqual(ws_sim(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_sim(src, tar), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(sim_dice('nelson', 'neilsen'), 8 / 15)

    def test_dice_dist(self):
        """Check abydos.distance.Dice.dist against known values."""
        exact_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(
                self.cmp.dist(src_counter, tar_counter), expected
            )
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.dist(nelson_counter, neilsen_counter), 7 / 15
        )

        # non-q-gram tests
        ws_dist = self.cmp_ws.dist
        for src, tar, expected in (
            ('', '', 0),
            ('the quick', '', 1),
            ('', 'the quick', 1),
        ):
            self.assertEqual(ws_dist(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_dist(src, tar), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(dist_dice('nelson', 'neilsen'), 7 / 15)
class FuzzyWuzzyTokenSortTestCases(unittest.TestCase):
    """Test cases for the FuzzyWuzzyTokenSort measure.

    abydos.distance.FuzzyWuzzyTokenSort
    """

    cmp = FuzzyWuzzyTokenSort()
    cmp_q2 = FuzzyWuzzyTokenSort(tokenizer=QGrams(qval=2))

    def test_fuzzywuzzy_token_sort_sim(self):
        """Check abydos.distance.FuzzyWuzzyTokenSort.sim."""
        sim = self.cmp.sim
        mets_braves = 'New York Mets vs Atlanta Braves'
        braves_mets = 'Atlanta Braves vs New York Mets'

        # Base cases (compared exactly)
        exact_cases = (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(sim(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.6),
            ('Niall', 'Nigel', 0.6),
            ('Colin', 'Coiln', 0.8),
            ('Coiln', 'Colin', 0.8),
            ('ATCAACGAGT', 'AACGATTAG', 0.6315789474),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(sim(src, tar), expected)

        # tests from blog
        self.assertEqual(sim(mets_braves, braves_mets), 1.0)

        # q2 tokenizer
        q2_cases = (
            ('ATCAACGAGT', 'AACGATTAG', 0.8524590163934426),
            ('YANKEES', 'NEW YORK YANKEES', 0.6027397260273972),
            ('NEW YORK METS', 'NEW YORK YANKEES', 0.7692307692307693),
            (mets_braves, braves_mets, 0.9578947368421052),
        )
        for src, tar, expected in q2_cases:
            self.assertAlmostEqual(self.cmp_q2.sim(src, tar), expected)

    def test_fuzzywuzzy_token_sort_dist(self):
        """Check abydos.distance.FuzzyWuzzyTokenSort.dist."""
        dist = self.cmp.dist

        # Base cases (compared exactly)
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(dist(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.4),
            ('Niall', 'Nigel', 0.4),
            ('Colin', 'Coiln', 0.2),
            ('Coiln', 'Colin', 0.2),
            ('ATCAACGAGT', 'AACGATTAG', 0.3684210526),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(dist(src, tar), expected)
class OverlapTestCases(unittest.TestCase):
    """Test cases for the Overlap measure.

    abydos.distance.Overlap
    """

    cmp = Overlap()
    cmp_q2 = Overlap(tokenizer=QGrams(2))
    cmp_ws = Overlap(tokenizer=WhitespaceTokenizer())

    def test_overlap_sim(self):
        """Check abydos.distance.Overlap.sim against known values."""
        exact_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 4 / 7)

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(self.cmp.sim(src_counter, tar_counter), expected)
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.sim(nelson_counter, neilsen_counter), 4 / 7
        )

        # non-q-gram tests
        ws_sim = self.cmp_ws.sim
        for src, tar, expected in (
            ('', '', 1),
            ('the quick', '', 0),
            ('', 'the quick', 0),
        ):
            self.assertEqual(ws_sim(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_sim(src, tar), 4 / 7)

    def test_overlap_dist(self):
        """Check abydos.distance.Overlap.dist against known values."""
        exact_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 3 / 7)

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(
                self.cmp.dist(src_counter, tar_counter), expected
            )
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.dist(nelson_counter, neilsen_counter), 3 / 7
        )

        # non-q-gram tests
        ws_dist = self.cmp_ws.dist
        for src, tar, expected in (
            ('', '', 0),
            ('the quick', '', 1),
            ('', 'the quick', 1),
        ):
            self.assertEqual(ws_dist(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_dist(src, tar), 3 / 7)
class EuclideanTestCases(unittest.TestCase):
    """Test cases for the Euclidean measure.

    abydos.distance.Euclidean
    """

    cmp = Euclidean()
    cmp_q2 = Euclidean(tokenizer=QGrams(2))
    cmp_ws = Euclidean(tokenizer=WhitespaceTokenizer())

    def test_euclidean_dist_abs(self):
        """Check abydos.distance.Euclidean.dist_abs against known values.

        Expected values that are square roots are compared with
        assertAlmostEqual rather than assertEqual: exact float equality on
        an irrational result depends on how the value was computed, so it
        is a fragile basis for a test. Only the exact zero result is
        compared with assertEqual.
        """
        root_cases = (
            ('nelson', '', 7**0.5),
            ('', 'neilsen', 8**0.5),
            ('nelson', 'neilsen', 7**0.5),
        )

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.dist_abs('', ''), 0)
            for src, tar, expected in root_cases:
                self.assertAlmostEqual(measure.dist_abs(src, tar), expected)

        # supplied q-gram tests
        self.assertEqual(
            self.cmp.dist_abs(
                QGrams().tokenize('').get_counter(),
                QGrams().tokenize('').get_counter(),
            ),
            0,
        )
        for src, tar, expected in root_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertAlmostEqual(
                self.cmp.dist_abs(src_counter, tar_counter), expected
            )

        # non-q-gram tests
        ws_dist_abs = self.cmp_ws.dist_abs
        self.assertEqual(ws_dist_abs('', ''), 0)
        for src, tar, expected in (
            ('the quick', '', 2**0.5),
            ('', 'the quick', 2**0.5),
            (NONQ_FROM, NONQ_TO, 8**0.5),
            (NONQ_TO, NONQ_FROM, 8**0.5),
        ):
            self.assertAlmostEqual(ws_dist_abs(src, tar), expected)

        # Test wrapper
        self.assertAlmostEqual(euclidean('nelson', 'neilsen'), 7**0.5)

    def test_euclidean_sim(self):
        """Check abydos.distance.Euclidean.sim against known values."""
        exact_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))
        nelson_sim = 1 - 7**0.5 / 23**0.5

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), nelson_sim
            )

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(self.cmp.sim(src_counter, tar_counter), expected)
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.sim(nelson_counter, neilsen_counter), nelson_sim
        )

        # non-q-gram tests
        ws_sim = self.cmp_ws.sim
        for src, tar, expected in (
            ('', '', 1),
            ('the quick', '', 0),
            ('', 'the quick', 0),
        ):
            self.assertEqual(ws_sim(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_sim(src, tar), 1 - 8**0.5 / 24**0.5)

        # Test wrapper
        self.assertAlmostEqual(sim_euclidean('nelson', 'neilsen'), nelson_sim)

    def test_euclidean_dist(self):
        """Check abydos.distance.Euclidean.dist against known values."""
        exact_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))
        nelson_dist = 7**0.5 / 23**0.5

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), nelson_dist
            )

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(
                self.cmp.dist(src_counter, tar_counter), expected
            )
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.dist(nelson_counter, neilsen_counter), nelson_dist
        )

        # non-q-gram tests
        ws_dist = self.cmp_ws.dist
        for src, tar, expected in (
            ('', '', 0),
            ('the quick', '', 1),
            ('', 'the quick', 1),
        ):
            self.assertEqual(ws_dist(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_dist(src, tar), 8**0.5 / 24**0.5)

        # Test wrapper
        self.assertAlmostEqual(
            dist_euclidean('nelson', 'neilsen'), nelson_dist
        )
class ManhattanTestCases(unittest.TestCase):
    """Test cases for the Manhattan measure.

    abydos.distance.Manhattan
    """

    cmp = Manhattan()
    cmp_q2 = Manhattan(tokenizer=QGrams(2))
    cmp_ws = Manhattan(tokenizer=WhitespaceTokenizer())

    def test_manhattan_dist_abs(self):
        """Check abydos.distance.Manhattan.dist_abs against known values."""
        exact_cases = (('', '', 0), ('nelson', '', 7), ('', 'neilsen', 8))

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)
            self.assertAlmostEqual(measure.dist_abs('nelson', 'neilsen'), 7)

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(
                self.cmp.dist_abs(src_counter, tar_counter), expected
            )
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.dist_abs(nelson_counter, neilsen_counter), 7
        )

        # non-q-gram tests
        ws_dist_abs = self.cmp_ws.dist_abs
        for src, tar, expected in (
            ('', '', 0),
            ('the quick', '', 2),
            ('', 'the quick', 2),
        ):
            self.assertEqual(ws_dist_abs(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_dist_abs(src, tar), 8)

        # Test wrapper
        self.assertAlmostEqual(manhattan('nelson', 'neilsen'), 7)

    def test_manhattan_sim(self):
        """Check abydos.distance.Manhattan.sim against known values."""
        exact_cases = (('', '', 1), ('nelson', '', 0), ('', 'neilsen', 0))

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(measure.sim('nelson', 'neilsen'), 8 / 15)

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(self.cmp.sim(src_counter, tar_counter), expected)
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.sim(nelson_counter, neilsen_counter), 8 / 15
        )

        # non-q-gram tests
        ws_sim = self.cmp_ws.sim
        for src, tar, expected in (
            ('', '', 1),
            ('the quick', '', 0),
            ('', 'the quick', 0),
        ):
            self.assertEqual(ws_sim(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_sim(src, tar), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(sim_manhattan('nelson', 'neilsen'), 8 / 15)

    def test_manhattan_dist(self):
        """Check abydos.distance.Manhattan.dist against known values."""
        exact_cases = (('', '', 0), ('nelson', '', 1), ('', 'neilsen', 1))

        # Same expectations for the cmp and cmp_q2 instances
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in exact_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(measure.dist('nelson', 'neilsen'), 7 / 15)

        # supplied q-gram tests
        for src, tar, expected in exact_cases:
            src_counter = QGrams().tokenize(src).get_counter()
            tar_counter = QGrams().tokenize(tar).get_counter()
            self.assertEqual(
                self.cmp.dist(src_counter, tar_counter), expected
            )
        nelson_counter = QGrams().tokenize('nelson').get_counter()
        neilsen_counter = QGrams().tokenize('neilsen').get_counter()
        self.assertAlmostEqual(
            self.cmp.dist(nelson_counter, neilsen_counter), 7 / 15
        )

        # non-q-gram tests
        ws_dist = self.cmp_ws.dist
        for src, tar, expected in (
            ('', '', 0),
            ('the quick', '', 1),
            ('', 'the quick', 1),
        ):
            self.assertEqual(ws_dist(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(ws_dist(src, tar), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(dist_manhattan('nelson', 'neilsen'), 7 / 15)
class CompleteLinkageTestCases(unittest.TestCase):
    """Test cases for the CompleteLinkage measure.

    abydos.distance.CompleteLinkage
    """

    cmp = CompleteLinkage()
    cmp_q4 = CompleteLinkage(tokenizer=QGrams(qval=4, start_stop=''))
    cmp_q4_jw = CompleteLinkage(
        tokenizer=QGrams(qval=4, start_stop=''), metric=JaroWinkler()
    )

    def test_complete_linkage_dist(self):
        """Check abydos.distance.CompleteLinkage.dist."""
        dist = self.cmp.dist

        # Base cases (compared exactly)
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(dist(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 1.0),
            ('Niall', 'Nigel', 1.0),
            ('Colin', 'Coiln', 1.0),
            ('Coiln', 'Colin', 1.0),
            ('ATCAACGAGT', 'AACGATTAG', 1.0),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(dist(src, tar), expected)

        # 4-gram tokenizer without start/stop symbols
        self.assertEqual(self.cmp_q4.dist('AAAT', 'AATT'), 0.25)
        self.assertAlmostEqual(
            self.cmp_q4_jw.dist('AAAT', 'AATT'), 0.133333333333
        )

    def test_complete_linkage_sim(self):
        """Check abydos.distance.CompleteLinkage.sim."""
        sim = self.cmp.sim

        # Base cases (compared exactly)
        exact_cases = (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(sim(src, tar), expected)

        approx_cases = (
            ('Nigel', 'Niall', 0.0),
            ('Niall', 'Nigel', 0.0),
            ('Colin', 'Coiln', 0.0),
            ('Coiln', 'Colin', 0.0),
            ('ATCAACGAGT', 'AACGATTAG', 0.0),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(sim(src, tar), expected)

    def test_complete_linkage_dist_abs(self):
        """Check abydos.distance.CompleteLinkage.dist_abs."""
        dist_abs = self.cmp.dist_abs

        # Base cases: any pair with an empty string yields -inf
        for src, tar in (
            ('', ''),
            ('a', ''),
            ('', 'a'),
            ('abc', ''),
            ('', 'abc'),
        ):
            self.assertEqual(dist_abs(src, tar), float('-inf'))

        for src, tar in (('abc', 'abc'), ('abcd', 'efgh')):
            self.assertEqual(dist_abs(src, tar), 2)

        for src, tar in (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        ):
            self.assertAlmostEqual(dist_abs(src, tar), 2)