def test_word_tokenize_stopwords():
    assert generator_cmp(word_tokenize('This is a lot of stopwords'), ['lot', 'stopwords'])
    test_case = 'I should get an empty list'
    assert generator_cmp(word_tokenize(test_case, test_case.split()), [])
    assert generator_cmp(word_tokenize(test_case, []), ['should', 'get', 'an', 'empty', 'list'])

def test_word_tokenize_remove_urls():
    assert generator_cmp(
        word_tokenize('This is a www.google.com hello', remove_urls=True),
        ['hello'])
    assert generator_cmp(
        word_tokenize('This is another maps.google.com without', remove_urls=False),
        ['another', 'mapsgooglecom', 'without'])

def test_word_tokenize():
    assert generator_cmp(word_tokenize('Hello cruel world'), ['hello', 'cruel', 'world'])
    assert generator_cmp(word_tokenize(''), [])
    assert generator_cmp(word_tokenize('empty +@@ punctuation'), ['empty', 'punctuation'])
    assert generator_cmp(word_tokenize('This shouldn\'t fail'), ['shouldnt', 'fail'])
    assert generator_cmp(word_tokenize('Cat and dog'), ['cat', 'dog'])
    assert generator_cmp(word_tokenize('I own a Dell laptop'), ['dell', 'laptop'])  # Regression test

def test_word_tokenize_large_whitespace():
    assert generator_cmp(word_tokenize('This \n is \r a \ttest'), ['test'])

def test_word_tokenize_punctuation():
    # Punctuation should always be removed from the front and back of tokens
    assert generator_cmp(word_tokenize('!My name is Michael!'), ['name', 'michael'])

def test_word_tokenize_digits():
    # Pure digits should be ignored, but combinations of digits and letters should be included
    assert generator_cmp(word_tokenize('gumball800 is cool'), ['gumball800', 'cool'])
    assert generator_cmp(word_tokenize('90 + ten'), ['ten'])

def test_word_tokenize_single_letters():
    # Single-letter tokens should be completely ignored
    assert generator_cmp(word_tokenize('a e i o u vowels', []), ['vowels'])
    assert generator_cmp(word_tokenize('!!!@#@##@#I *a Gold', []), ['gold'])
    assert generator_cmp(word_tokenize('aa i', []), ['aa'])

def test_word_tokenize_special_punctuation():
    assert generator_cmp(word_tokenize('self-determination'), ['self', 'determination'])
    assert generator_cmp(word_tokenize('Red/Green'), ['red', 'green'])
    assert generator_cmp(word_tokenize('Red\\Green'), ['red', 'green'])
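

# The tests above rely on a generator_cmp helper that is not defined in this
# file. A minimal sketch of what such a helper could look like is given below,
# assuming it only needs to check that a generator yields exactly the expected
# items in order; the project's real helper may differ.
def generator_cmp(generator, expected):
    """Return True if the generator yields exactly the items in expected."""
    return list(generator) == list(expected)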