def test_tokenize_clean_punct_false(self):
    """Punctuation marks must survive as standalone tokens when
    clean_punctuation=False and only_alphanum=False."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_punctuation=False,
        only_alphanum=False)
    expected = [
        'A', 'written', 'language', 'is', 'the', 'representation', 'of',
        'a', 'language', 'by', 'means', 'of', 'a', 'writing', 'system',
        '.', 'Written', 'language', 'is', 'an', 'invention', 'in', 'that',
        'it', 'must', 'be', 'taught', 'to', 'children', ';', 'children',
        'will', 'pick', 'up', 'spoken', 'language', '(', 'oral', 'or',
        'sign', ')', 'by', 'exposure', 'without', 'being', 'specifically',
        'taught', '.'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def set_number_of_tokens():
    """Persist each document's token count.

    Iterates every document in the configured MongoDB collection, tokenizes
    its 'text' field, and stores the count in 'number_of_words'.
    """
    from includes.tokenizer import tokenize
    from pymongo import MongoClient

    client = MongoClient()
    db = client[config.DATABASE_NAME]
    for document in db.document.find():
        # update_one with $set writes only the changed field and replaces
        # the deprecated Collection.save() call (removed in PyMongo 4).
        db.document.update_one(
            {'_id': document['_id']},
            {'$set': {'number_of_words': len(tokenize(document['text']))}})
def test_tokenize_clean_stop_words(self):
    """With clean_stop_words=True common function words must be removed."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_stop_words=True)
    expected = [
        'written', 'language', 'representation', 'language', 'means',
        'writing', 'system', 'Written', 'language', 'invention', 'must',
        'taught', 'children', 'children', 'pick', 'spoken', 'language',
        'oral', 'sign', 'exposure', 'without', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_moby_dick_window(self):
    """Windowing Moby Dick must visit every token exactly once, for a
    range of window sizes."""
    # range works on both Python 2 and 3; xrange is Python-2-only.
    window_sizes = range(100, 6000, 100)
    text = gutenberg.raw('melville-moby_dick.txt')
    tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
    total_number_of_tokens = len(tokens)
    for window_size in window_sizes:
        count = 0
        # NOTE(review): on Python 2 this is floor division, so ceil() is a
        # no-op; the +1 in the loop below covers a trailing partial window.
        number_of_windows = int(
            math.ceil(total_number_of_tokens / window_size))
        for current_window in range(0, number_of_windows + 1):
            word_window = Window(tokens, window_size, current_window)
            for word in word_window:
                count += 1
        # assertEquals is deprecated; assertEqual is the supported name.
        self.assertEqual(count, total_number_of_tokens)
def test_tokenize_happy_path(self):
    """Default tokenize() must drop punctuation and single-letter tokens
    while keeping word order."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught."
    )
    expected = [
        'written', 'language', 'is', 'the', 'representation', 'of',
        'language', 'by', 'means', 'of', 'writing', 'system', 'Written',
        'language', 'is', 'an', 'invention', 'in', 'that', 'it', 'must',
        'be', 'taught', 'to', 'children', 'children', 'will', 'pick', 'up',
        'spoken', 'language', 'oral', 'or', 'sign', 'by', 'exposure',
        'without', 'being', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_moby_dick_window(self):
    """Windowing Moby Dick must visit every token exactly once, for a
    range of window sizes."""
    # range works on both Python 2 and 3; xrange is Python-2-only.
    window_sizes = range(100, 6000, 100)
    text = gutenberg.raw('melville-moby_dick.txt')
    tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
    total_number_of_tokens = len(tokens)
    for window_size in window_sizes:
        count = 0
        # NOTE(review): on Python 2 this is floor division, so ceil() is a
        # no-op; the +1 in the loop below covers a trailing partial window.
        number_of_windows = int(
            math.ceil(total_number_of_tokens / window_size))
        for current_window in range(0, number_of_windows + 1):
            word_window = Window(tokens, window_size, current_window)
            for word in word_window:
                count += 1
        # assertEquals is deprecated; assertEqual is the supported name.
        self.assertEqual(count, total_number_of_tokens)
def test_tokenize_clean_punct_false(self):
    """Punctuation marks must survive as standalone tokens when
    clean_punctuation=False and only_alphanum=False."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_punctuation=False,
        only_alphanum=False)
    expected = [
        'A', 'written', 'language', 'is', 'the', 'representation', 'of',
        'a', 'language', 'by', 'means', 'of', 'a', 'writing', 'system',
        '.', 'Written', 'language', 'is', 'an', 'invention', 'in', 'that',
        'it', 'must', 'be', 'taught', 'to', 'children', ';', 'children',
        'will', 'pick', 'up', 'spoken', 'language', '(', 'oral', 'or',
        'sign', ')', 'by', 'exposure', 'without', 'being', 'specifically',
        'taught', '.'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def tokenizer_wrapper(raw_text):
    """Tokenize raw_text into lower-cased alphanumeric tokens.

    Returns a list. A comprehension (matching get_moby_dick_tokens) is used
    instead of map(str.lower, ...): on Python 3 map is a lazy one-shot
    iterator, and the unbound str.lower rejects non-str token types.
    """
    return [
        token.lower()
        for token in tokenize(raw_text, only_alphanum=True,
                              clean_punctuation=True)
    ]
def test_tokenize_only_alpha(self):
    """only_alpha=True must drop purely numeric tokens."""
    # PEP 8: no spaces around '=' in keyword arguments.
    result = tokenize("123 lalala 123 pepepe 4566 sarasaa", only_alpha=True)
    expected = ['lalala', 'pepepe', 'sarasaa']
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_tokenize_clean_stop_words(self):
    """With clean_stop_words=True common function words must be removed."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_stop_words=True)
    expected = [
        'written', 'language', 'representation', 'language', 'means',
        'writing', 'system', 'Written', 'language', 'invention', 'must',
        'taught', 'children', 'children', 'pick', 'spoken', 'language',
        'oral', 'sign', 'exposure', 'without', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_tokenize_happy_path(self):
    """Default tokenize() must drop punctuation and single-letter tokens
    while keeping word order."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught."
    )
    expected = [
        'written', 'language', 'is', 'the', 'representation', 'of',
        'language', 'by', 'means', 'of', 'writing', 'system', 'Written',
        'language', 'is', 'an', 'invention', 'in', 'that', 'it', 'must',
        'be', 'taught', 'to', 'children', 'children', 'will', 'pick', 'up',
        'spoken', 'language', 'oral', 'or', 'sign', 'by', 'exposure',
        'without', 'being', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def get_moby_dick_tokens():
    """Return the lower-cased alphanumeric tokens of Melville's Moby Dick."""
    raw_text = gutenberg.raw('melville-moby_dick.txt')
    words = tokenize(raw_text, only_alphanum=True, clean_punctuation=True)
    return [word.lower() for word in words]
def test_tokenize_only_alpha(self):
    """only_alpha=True must drop purely numeric tokens."""
    result = tokenize("123 lalala 123 pepepe 4566 sarasaa", only_alpha=True)
    expected = ['lalala', 'pepepe', 'sarasaa']
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def tokenizer_wrapper(raw_text):
    """Tokenize raw_text, coercing each token to str and lower-casing it.

    Returns a list. A single comprehension replaces the nested
    map(str.lower, map(str, ...)) chain: clearer, one pass, and on
    Python 3 it returns a reusable list rather than a lazy one-shot map.
    """
    return [
        str(token).lower()
        for token in tokenize(raw_text, only_alphanum=True,
                              clean_punctuation=True)
    ]