def test_glyphs(self):
    """Test the correct detection and rendering of glyphs.

    The characters in the resulting token should be the characters that
    are the content of the g tag. A self-closing g tag therefore
    contributes nothing to the token text (first entry of ``testdata``).

    Two documents are checked: ``self.testdoc`` against ``testdata`` and
    ``self.testdoc_special`` against ``testdata_special``. Every token
    whose literal form contains a g tag must be listed in the expected
    data, and every expected entry must be seen in the document.
    """
    # Prefix that identifies a token literal containing a TEI g element.
    glyph_marker = '<g xmlns="http://www.tei-c.org/ns/1.0" ref="'

    # Token literal -> expected token text; each literal is expected once.
    testdata = {
        'յեգի<g xmlns="http://www.tei-c.org/ns/1.0" ref="#պտ"/>ոս': 'յեգիոս',
        'յ<g xmlns="http://www.tei-c.org/ns/1.0" ref="աշխարհ">աշխար</g>հն': 'յաշխարհն',
        '<g xmlns="http://www.tei-c.org/ns/1.0" ref="">աշխարհ</g>ին': 'աշխարհին',
        'ար<g xmlns="http://www.tei-c.org/ns/1.0" ref="">ա</g>պ<lb xmlns="http://www.tei-c.org/ns/1.0" xml:id="l101276841" n="14"/>կաց': 'արապկաց',
        '<g xmlns="http://www.tei-c.org/ns/1.0" ref="">աշխարհ</g>ն': 'աշխարհն',
    }

    # Token literal -> expected token text plus the number of times the
    # literal is expected to occur in the special test document.
    testdata_special = {
        'յեգի<g xmlns="http://www.tei-c.org/ns/1.0" ref="#ptlig">պտ</g>ոս': {'token': 'յեգիպտոս', 'occurrence': 1},
        'յ<g xmlns="http://www.tei-c.org/ns/1.0" ref="#asxarh">աշխար</g>հն': {'token': 'յաշխարհն', 'occurrence': 1},
        '<g xmlns="http://www.tei-c.org/ns/1.0" ref="#asxarh">աշխարհ</g>ին': {'token': 'աշխարհին', 'occurrence': 2},
        'ար<g xmlns="http://www.tei-c.org/ns/1.0" ref="#avar">ա</g>պ<lb xmlns="http://www.tei-c.org/ns/1.0" xml:id="l101276841" n="14"/>կաց': {'token': 'արապկաց', 'occurrence': 1},
        '<g xmlns="http://www.tei-c.org/ns/1.0" ref="#asxarh">աշխարհ</g>ն': {'token': 'աշխարհն', 'occurrence': 1},
    }

    tokens = wordtokenize.from_etree(self.testdoc)
    # Find the tokens that have our substitution
    for t in tokens:
        lit = t['lit']
        if glyph_marker not in lit:
            continue
        self.assertIn(lit, testdata,
                      "Error in rendering glyphs (input data not covered by testdata)")
        self.assertEqual(t['t'], testdata[lit], "Error in rendering glyphs")
        # Tick the entry off; a second occurrence of the same literal
        # will then fail the membership check above.
        del testdata[lit]
    # Whatever is left over was expected but never produced by the tokenizer.
    self.assertEqual(testdata, {}, "Did not find all expected test tokens")

    tokens = wordtokenize.from_etree(self.testdoc_special)
    # Find the tokens that have our substitution
    for t in tokens:
        lit = t['lit']
        if glyph_marker not in lit:
            continue
        self.assertIn(lit, testdata_special,
                      "Error in rendering glyphs (input data not covered by testdata)")
        expected = testdata_special[lit]
        self.assertEqual(t['t'], expected['token'], "Error in rendering glyphs")
        # Count down the remaining expected occurrences and drop the
        # entry once all of them have been seen.
        expected['occurrence'] -= 1
        if expected['occurrence'] == 0:
            del testdata_special[lit]
    self.assertEqual(testdata_special, {}, "Did not find all expected test tokens")
def test_substitution_layer(self):
    """Test that the first_layer option works correctly.

    Tokenizes ``self.testdoc`` with ``first_layer=True`` and checks the
    token whose literal form carries a ``del`` element: its text is
    expected to include the deleted letter. The test fails if no such
    token is produced at all.
    """
    target_lit = 'դե<del xmlns="http://www.tei-c.org/ns/1.0">ղ</del>ևս'
    tokens = wordtokenize.from_etree(self.testdoc, first_layer=True)
    # Find the token that has our substitution
    for t in tokens:
        if t['lit'] == target_lit:
            self.assertEqual(t['t'], 'դեղևս')
            break
    else:
        # The loop ended without a break: the token never appeared.
        self.fail("Did not find the testing token")
def test_substitution(self):
    """Test that the correct words are picked out of a subst tag.

    Tokenizes ``self.testdoc`` with default options and checks the token
    whose literal form carries an ``add`` element: its text is expected
    to include the added letter. The test fails if no such token is
    produced at all.
    """
    target_lit = 'դե<add xmlns="http://www.tei-c.org/ns/1.0">ռ</add>ևս'
    tokens = wordtokenize.from_etree(self.testdoc)
    # Find the token that has our substitution
    for t in tokens:
        if t['lit'] == target_lit:
            self.assertEqual(t['t'], 'դեռևս')
            break
    else:
        # The loop ended without a break: the token never appeared.
        self.fail("Did not find the testing token")