def testGerman(self):
    # German sentences containing umlauts, run through a splitter built
    # with singlechar=1.  The expected tokens are all lowercase and keep
    # their umlauts.
    splitter = Splitter(singlechar=1)
    cases = (
        ('der bäcker Ging über die Brücke',
         ['der', 'bäcker', 'ging', 'über', 'die', 'brücke']),
        ('der äücker Ging über die Brücke',
         ['der', 'äücker', 'ging', 'über', 'die', 'brücke']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testDisabledCaseFolding(self):
    # With casefolding=0 the expected tokens keep the capitalisation of
    # the input ('Foo' stays 'Foo').
    splitter = Splitter(casefolding=0)
    cases = (
        ('', []),
        ('foo', ['foo']),
        ('foo', ['foo']),  # same input checked a second time
        (' Foo ', ['Foo']),
        (' Foo Bar', ['Foo', 'Bar']),
        (' foo Bar ', ['foo', 'Bar']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testEnabledCaseFolding(self):
    # With casefolding=1 every expected token is lowercase, whatever the
    # capitalisation of the input.
    splitter = Splitter(casefolding=1)
    cases = (
        ('', []),
        ('foo', ['foo']),
        ('foo', ['foo']),  # same input checked a second time
        (' Foo ', ['foo']),
        (' Foo Bar', ['foo', 'bar']),
        (' foo Bar ', ['foo', 'bar']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testSimple(self):
    # Default-constructed splitter: empty input, surrounding blanks and
    # purely numeric words.
    splitter = Splitter()
    cases = (
        ('', []),
        ('foo', ['foo']),
        ('foo', ['foo']),  # same input checked a second time
        (' foo ', ['foo']),
        (' foo bar', ['foo', 'bar']),
        (' foo bar ', ['foo', 'bar']),
        (' foo 23 25 bar ', ['foo', '23', '25', 'bar']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testParagraphs(self):
    # The paragraph sign is passed as the separator argument; the
    # expected tokens keep it both at the start of a token and inside one.
    splitter = Splitter(singlechar=1, separator='§')
    text = 'dies ist §8 b b§b'
    expected = ['dies', 'ist', '§8', 'b', 'b§b']
    self._test(splitter, text, expected)
def testSwedish(self):
    # Swedish sentence with a non-ASCII first letter and a trailing full
    # stop; the full stop is absent from the expected tokens.
    splitter = Splitter(singlechar=1)
    text = 'åke vill ju inte alls leka med mig.'
    expected = ['åke', 'vill', 'ju', 'inte', 'alls', 'leka', 'med', 'mig']
    self._test(splitter, text, expected)
def testSingleChars(self):
    # With singlechar=1 one-character words (letters and digits alike)
    # are expected to show up as tokens of their own.
    splitter = Splitter(singlechar=1)
    cases = (
        ('ab a b c', ['ab', 'a', 'b', 'c']),
        ('foo 2 2 bar ', ['foo', '2', '2', 'bar']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testSeparator2(self):
    # separator=".": a dot between two word characters is expected to
    # stay inside the token ('12.12', 'line.foo'), while a dot at the end
    # of a word is expected to be dropped ('of.', 'end.').
    splitter = Splitter(separator=".")
    cases = (
        ('end 12.12 line', ['end', '12.12', 'line']),
        ('end of. line.foo end.', ['end', 'of', 'line.foo', 'end']),
        ('end of. line', ['end', 'of', 'line']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testSeparator1(self):
    # separator=".-": '.' and '-' are expected to stay inside a token,
    # whereas '_' (not listed as a separator) is expected to split it.
    splitter = Splitter(separator=".-")
    cases = (
        ('foo 22.33 bar', ['foo', '22.33', 'bar']),
        ('foo 22-33 bar', ['foo', '22-33', 'bar']),
        ('foo 22_33 bar', ['foo', '22', '33', 'bar']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testMaxlen(self):
    # maxlen=5: expected tokens are cut down to their first five
    # characters; a single 14000-character word yields just one token.
    splitter = Splitter(maxlen=5)
    cases = (
        ('abcdefg foo barfoo', ['abcde', 'foo', 'barfo']),
        ('abcdefg' * 2000, ['abcde']),
    )
    for text, expected in cases:
        self._test(splitter, text, expected)
def testSingleCharComparisonPoisoning(self):
    # u'D' has to compare equal to 'D' both before and after a
    # one-character unicode string has been handed to split().
    splitter = Splitter()

    # Baseline comparison, before the splitter has seen any input.
    self.assertEqual(u'D', 'D')

    # The return value is irrelevant here; only the call matters.
    splitter.split(u'D')

    # The very same comparison must still hold afterwards.
    self.assertEqual(u'D', 'D')
def testSingleCharCachePoisoning(self):
    # After split() has processed a one-character unicode string, a
    # unicode object built from 'D' must still equal the literal u'D'.
    splitter = Splitter()

    # The return value is irrelevant here; only the call matters.
    splitter.split(u'D')

    self.assertEqual(u'D', unicode('D'))