def testTokenSimple(self):
     """A plain ASCII sentence yields its words, lower-cased, in order."""
     expected = [u'this', u'is', u'a', u'fox']
     tokens = tokenizer().tokenize(u'This is a fox')
     assert tokens == expected
 def testTokenUnicode(self):
     """A word containing a non-ASCII letter ('föx') is kept intact."""
     expected = [u'this', u'is', u'a', u'föx']
     tokens = tokenizer().tokenize(u'This is a föx')
     assert tokens == expected
 def testTokenInvalidInput(self):
     """Symbol noise produces no tokens; only 'this' and 'x' are expected.

     The input mixes runs of punctuation, the fragments '##b' and '##s',
     and a single letter 'x' embedded between symbols.
     """
     expected = [u'this', u'x']
     noisy_text = u'This ### ##b ##s +++ *** \' !§$%x&/()=? " "'
     tokens = tokenizer().tokenize(noisy_text)
     assert tokens == expected
 def testTokenUrl(self):
     """An embedded 'http://' URL is expected to be absent from the tokens."""
     expected = [u'this', u'is', u'a', u'fox', u'news']
     tokens = tokenizer().tokenize(u'This is a http://fox.com fox news')
     assert tokens == expected
 def testTokenCharFold(self):
     """A long run of one repeated letter ('fooooooox') folds to 'foox'."""
     expected = [u'this', u'is', u'a', u'foox']
     tokens = tokenizer().tokenize(u'This is a fooooooox')
     assert tokens == expected
 def testTokenHash(self):
     """A '#'-prefixed word ('#fox') is expected to yield no token at all."""
     expected = [u'this', u'is', u'a']
     tokens = tokenizer().tokenize(u'This is a #fox')
     assert tokens == expected
 def testTokenUsername(self):
     """An '@'-prefixed word ('@fox') is expected to yield no token at all."""
     expected = [u'this', u'is', u'a']
     tokens = tokenizer().tokenize(u'This is a @fox')
     assert tokens == expected
 def testTokenEmos(self):
     """The emoticons ':)' and ':(' map to '##s##' and '##b##' respectively."""
     expected = [u'this', u'is', u'a', u'fox', u'##s##', u'##b##']
     tokens = tokenizer().tokenize(u'This is a fox :) :(')
     assert tokens == expected