import array
import textwrap

# Assumed import path for the English tokenizer under test (PyEnchant's
# layout); adjust if the module lives elsewhere.
from enchant.tokenize.en import tokenize as tokenize_en


def test_bug2785373():
    """Testcases for bug #2785373"""
    # The original testcase exercised both byte- and unicode-string inputs;
    # under Python 3 these are the same type, so a single pass suffices.
    text = "So, one dey when I wes 17, I left."
    for _ in tokenize_en(text):
        pass

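# The fixtures below all rely on tokenize_en yielding (word, offset) pairs,
# with offset giving the word's position in the input string. A minimal
# standalone check of that contract (a sketch; the test name and sample
# sentence are ours, not part of the regression fixtures):
def test_token_offsets_match_input():
    text = "Hello there"
    for (word, pos) in tokenize_en(text):
        # Each reported offset must point at the token it accompanies.
        assert text[pos:pos + len(word)] == word
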
def test_bug1591450():
    """Check for tokenization regressions identified in bug #1591450."""
    text = """Testing <i>markup</i> and {y:i}so-forth...leading dots and trail---
well, you get-the-point. Also check numbers: 999 1,000
12:00 .45. Done?"""
    output = [
        ("Testing", 0), ("i", 9), ("markup", 11), ("i", 19), ("and", 22),
        ("y", 27), ("i", 29), ("so", 31), ("forth", 34), ("leading", 42),
        ("dots", 50), ("and", 55), ("trail", 59), ("well", 68), ("you", 74),
        ("get", 78), ("the", 82), ("point", 86), ("Also", 93), ("check", 98),
        ("numbers", 104), ("Done", 134),
    ]
    for (itmO, itmV) in zip(output, tokenize_en(text)):
        assert itmO == itmV

def test_unicodeCombining():
    """Test tokenization with unicode combining symbols."""
    text = (
        "Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn "
        "knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee "
        "enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re "
        "op Cura\u00E7ao"
    )
    output = text.split(" ")
    output[8] = output[8][0:-1]  # drop the trailing comma from "knieën,"
    for (itmO, itmV) in zip(output, tokenize_en(text)):
        assert itmO == itmV[0]
        assert text[itmV[1]:].startswith(itmO)

def test_unicodeBasic():
    """Test tokenization of a basic unicode string."""
    text = (
        "Ik ben geïnteresseerd in de coördinatie van mijn knieën, "
        "maar kan niet één à twee enquêtes vinden die recht doet "
        "aan mijn carrière op Curaçao"
    )
    output = text.split(" ")
    output[8] = output[8][0:-1]  # drop the trailing comma from "knieën,"
    for (itmO, itmV) in zip(output, tokenize_en(text)):
        assert itmO == itmV[0]
        assert text[itmV[1]:].startswith(itmO)

def test_utf8_bytes_at_end():
    """Test tokenization of UTF8-encoded bytes at end of word."""
    # Bytestring inputs only make sense on Python 2; skip on Python 3,
    # where str is no longer the bytes type.
    if str is not bytes:
        return
    text = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
    output = text.split(" ")
    output[1] = output[1][0:-1]  # drop the trailing comma from the first résumé
    for (itmO, itmV) in zip(output, tokenize_en(text)):
        assert itmO == itmV[0]

def test_utf8_bytes():
    """Test tokenization of UTF8-encoded bytes (bug #2500184)."""
    # Bytestring inputs only make sense on Python 2; skip on Python 3.
    if str is not bytes:
        return
    text = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
    output = text.split(" ")
    output[1] = output[1][0:-1]  # drop the trailing comma from the first résumé
    for (itmO, itmV) in zip(output, tokenize_en(text)):
        assert itmO == itmV[0]
        assert text[itmV[1]:].startswith(itmO)

def test_utf8_bytes_in_an_array():
    """Test tokenization of UTF8-encoded bytes stored in an array."""
    # Bytestring inputs (and the 'c' array typecode) only exist on
    # Python 2; skip on Python 3.
    if str is not bytes:
        return
    text = "A r\xc3\xa9sum\xc3\xa9, also spelled resum\xc3\xa9 or resume"
    output = text.split(" ")
    output[1] = output[1][0:-1]  # drop the trailing comma from the first résumé
    text = array.array('c', text)
    output = [array.array('c', w) for w in output]
    for (itmO, itmV) in zip(output, tokenize_en(text)):
        assert itmO == itmV[0]
        assert text[itmV[1]:itmV[1] + len(itmV[0])] == itmO

def test_tokenize_en():
    """Simple regression test for English tokenization."""
    # NB: the two spaces after "paragraph." are significant for the offsets.
    text = """This is a paragraph.  It's not very special, but it's designed
2 show how the splitter works with many-different combos
of words. Also need to "test" the handling of 'quoted' words."""
    output = [
        ("This", 0), ("is", 5), ("a", 8), ("paragraph", 10), ("It's", 22),
        ("not", 27), ("very", 31), ("special", 36), ("but", 45), ("it's", 49),
        ("designed", 54), ("show", 65), ("how", 70), ("the", 74),
        ("splitter", 78), ("works", 87), ("with", 93), ("many", 98),
        ("different", 103), ("combos", 113), ("of", 120), ("words", 123),
        ("Also", 130), ("need", 135), ("to", 140), ("test", 144),
        ("the", 150), ("handling", 154), ("of", 163), ("quoted", 167),
        ("words", 175),
    ]
    for (itmO, itmV) in zip(output, tokenize_en(text)):
        assert itmO == itmV

def test_finnish_text():
    """Test tokenizing some Finnish text.

    This really should work since there are no special rules to apply,
    just lots of non-ascii characters.
    """
    # NB: the trailing space after "sanastaja" is part of the fixture and
    # is significant for the expected offsets.
    text = textwrap.dedent("""\
    Tämä on kappale. Eipä ole kovin 2 nen, mutta tarkoitus on näyttää miten sanastaja 
    toimii useiden-erilaisten sanaryppäiden kimpussa.
    Pitääpä vielä 'tarkistaa' sanat jotka "lainausmerkeissä". Heittomerkki ja vaa'an.
    Ulkomaisia sanoja süss, spaß.
    """)
    expected_tokens = [
        ("Tämä", 0), ("on", 5), ("kappale", 8), ("Eipä", 17), ("ole", 22),
        ("kovin", 26), ("nen", 34), ("mutta", 39), ("tarkoitus", 45),
        ("on", 55), ("näyttää", 58), ("miten", 66), ("sanastaja", 72),
        ("toimii", 83), ("useiden", 90), ("erilaisten", 98),
        ("sanaryppäiden", 109), ("kimpussa", 123), ("Pitääpä", 133),
        ("vielä", 141), ("tarkistaa", 148), ("sanat", 159), ("jotka", 165),
        ("lainausmerkeissä", 172), ("Heittomerkki", 191), ("ja", 204),
        ("vaa'an", 207), ("Ulkomaisia", 215), ("sanoja", 226), ("süss", 233),
        ("spaß", 239),
    ]
    assert list(tokenize_en(text)) == expected_tokens

def test_typographic_apostrophe():
    """Typographic apostrophes should be word separators in English."""
    text = "They\u2019re here"
    expected_tokens = [("They", 0), ("re", 5), ("here", 8)]
    assert list(tokenize_en(text)) == expected_tokens
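
# The ASCII apostrophe behaves differently: test_tokenize_en above expects
# "It's" to come through as a single token. A minimal sketch of that
# contrast (a new check, assuming the same mid-word apostrophe handling
# that test_tokenize_en exercises):
def test_ascii_apostrophe_kept_inside_word():
    text = "They're here"
    expected_tokens = [("They're", 0), ("here", 8)]
    assert list(tokenize_en(text)) == expected_tokens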