def test_tokenize_clean_punct_false(self):
    """Punctuation marks survive as tokens when cleaning is disabled."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_punctuation=False,
        only_alphanum=False)

    expected = [
        'A', 'written', 'language', 'is', 'the', 'representation', 'of',
        'a', 'language', 'by', 'means', 'of', 'a', 'writing', 'system',
        '.', 'Written', 'language', 'is', 'an', 'invention', 'in', 'that',
        'it', 'must', 'be', 'taught', 'to', 'children', ';', 'children',
        'will', 'pick', 'up', 'spoken', 'language', '(', 'oral', 'or',
        'sign', ')', 'by', 'exposure', 'without', 'being', 'specifically',
        'taught', '.'
    ]

    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
Example #2
0
def set_number_of_tokens():
    """Recompute and persist the word count of every stored document.

    Iterates the ``document`` collection and writes the token count of
    each document's ``text`` field back as ``number_of_words``.
    """
    from includes.tokenizer import tokenize
    from pymongo import MongoClient

    client = MongoClient()
    # NOTE(review): `config` is not imported here — presumably a module-level
    # import elsewhere in the file; verify.
    db = client[config.DATABASE_NAME]

    for document in db.document.find():
        # Collection.save() was deprecated in PyMongo 3 and removed in 4;
        # update only the field we changed instead of rewriting the document.
        db.document.update_one(
            {'_id': document['_id']},
            {'$set': {'number_of_words': len(tokenize(document['text']))}})
Example #3
0
def set_number_of_tokens():
    """Recompute and persist the word count of every stored document.

    Iterates the ``document`` collection and writes the token count of
    each document's ``text`` field back as ``number_of_words``.
    """
    from includes.tokenizer import tokenize
    from pymongo import MongoClient

    client = MongoClient()
    # NOTE(review): `config` is not imported here — presumably a module-level
    # import elsewhere in the file; verify.
    db = client[config.DATABASE_NAME]

    for document in db.document.find():
        # Collection.save() was deprecated in PyMongo 3 and removed in 4;
        # update only the field we changed instead of rewriting the document.
        db.document.update_one(
            {'_id': document['_id']},
            {'$set': {'number_of_words': len(tokenize(document['text']))}})
Example #4
0
    def test_tokenize_clean_stop_words(self):
        """clean_stop_words=True drops common stop words from the output."""
        result = tokenize(
            "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
            clean_stop_words=True)

        expected = [
            'written', 'language', 'representation', 'language', 'means',
            'writing', 'system', 'Written', 'language', 'invention', 'must',
            'taught', 'children', 'children', 'pick', 'spoken', 'language',
            'oral', 'sign', 'exposure', 'without', 'specifically', 'taught'
        ]
        # assertEquals is a deprecated alias; assertEqual is the supported name.
        self.assertEqual(expected, result)
 def test_moby_dick_window(self):
     """Across many window sizes, every token falls in exactly one window."""
     # xrange is Python 2 only; range behaves identically for iteration here.
     window_sizes = range(100, 6000, 100)
     text = gutenberg.raw('melville-moby_dick.txt')
     tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
     total_number_of_tokens = len(tokens)
     for window_size in window_sizes:
         count = 0
         # Force true division so math.ceil actually rounds up — under
         # Python 2 integer division the ceil was a no-op.
         number_of_windows = int(
             math.ceil(float(total_number_of_tokens) / window_size))
         for current_window in range(0, number_of_windows + 1):
             word_window = Window(tokens, window_size, current_window)
             for word in word_window:
                 count += 1
         # assertEquals is deprecated; assertEqual is the supported name.
         self.assertEqual(count, total_number_of_tokens)
Example #6
0
    def test_tokenize_happy_path(self):
        """Default tokenization strips punctuation and one-letter tokens."""
        result = tokenize(
            "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught."
        )

        expected = [
            'written', 'language', 'is', 'the', 'representation', 'of',
            'language', 'by', 'means', 'of', 'writing', 'system', 'Written',
            'language', 'is', 'an', 'invention', 'in', 'that', 'it', 'must',
            'be', 'taught', 'to', 'children', 'children', 'will', 'pick', 'up',
            'spoken', 'language', 'oral', 'or', 'sign', 'by', 'exposure',
            'without', 'being', 'specifically', 'taught'
        ]

        # assertEquals is a deprecated alias; assertEqual is the supported name.
        self.assertEqual(expected, result)
Example #7
0
 def test_moby_dick_window(self):
     """Across many window sizes, every token falls in exactly one window."""
     # xrange is Python 2 only; range behaves identically for iteration here.
     window_sizes = range(100, 6000, 100)
     text = gutenberg.raw('melville-moby_dick.txt')
     tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
     total_number_of_tokens = len(tokens)
     for window_size in window_sizes:
         count = 0
         # Force true division so math.ceil actually rounds up — under
         # Python 2 integer division the ceil was a no-op.
         number_of_windows = int(
             math.ceil(float(total_number_of_tokens) / window_size))
         for current_window in range(0, number_of_windows + 1):
             word_window = Window(tokens, window_size, current_window)
             for word in word_window:
                 count += 1
         # assertEquals is deprecated; assertEqual is the supported name.
         self.assertEqual(count, total_number_of_tokens)
Example #8
0
    def test_tokenize_clean_punct_false(self):
        """Punctuation marks survive as tokens when cleaning is disabled."""
        result = tokenize(
            "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
            clean_punctuation=False,
            only_alphanum=False)

        expected = [
            'A', 'written', 'language', 'is', 'the', 'representation', 'of',
            'a', 'language', 'by', 'means', 'of', 'a', 'writing', 'system',
            '.', 'Written', 'language', 'is', 'an', 'invention', 'in', 'that',
            'it', 'must', 'be', 'taught', 'to', 'children', ';', 'children',
            'will', 'pick', 'up', 'spoken', 'language', '(', 'oral', 'or',
            'sign', ')', 'by', 'exposure', 'without', 'being', 'specifically',
            'taught', '.'
        ]

        # assertEquals is a deprecated alias; assertEqual is the supported name.
        self.assertEqual(expected, result)
Example #9
0
 def tokenizer_wrapper(raw_text):
     """Tokenize *raw_text* and lower-case every token.

     Returns a concrete list rather than a lazy ``map`` object: under
     Python 3 ``map`` is a single-pass iterator, which silently breaks
     callers that iterate the result more than once.
     """
     tokens = tokenize(raw_text, only_alphanum=True, clean_punctuation=True)
     return [token.lower() for token in tokens]
 def test_tokenize_only_alpha(self):
     """only_alpha=True drops purely numeric tokens."""
     result = tokenize("123 lalala 123 pepepe 4566 sarasaa", only_alpha=True)
     expected = ['lalala', 'pepepe', 'sarasaa']
     # assertEquals is a deprecated alias; assertEqual is the supported name.
     self.assertEqual(expected, result)
    def test_tokenize_clean_stop_words(self):
        """clean_stop_words=True drops common stop words from the output."""
        result = tokenize(
            "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
            clean_stop_words=True)

        expected = [
            'written', 'language', 'representation', 'language', 'means',
            'writing', 'system', 'Written', 'language', 'invention', 'must',
            'taught', 'children', 'children', 'pick', 'spoken', 'language',
            'oral', 'sign', 'exposure', 'without', 'specifically', 'taught'
        ]
        # assertEquals is a deprecated alias; assertEqual is the supported name.
        self.assertEqual(expected, result)
    def test_tokenize_happy_path(self):
        """Default tokenization strips punctuation and one-letter tokens."""
        result = tokenize(
            "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught."
        )

        expected = [
            'written', 'language', 'is', 'the', 'representation', 'of',
            'language', 'by', 'means', 'of', 'writing', 'system', 'Written',
            'language', 'is', 'an', 'invention', 'in', 'that', 'it', 'must',
            'be', 'taught', 'to', 'children', 'children', 'will', 'pick', 'up',
            'spoken', 'language', 'oral', 'or', 'sign', 'by', 'exposure',
            'without', 'being', 'specifically', 'taught'
        ]

        # assertEquals is a deprecated alias; assertEqual is the supported name.
        self.assertEqual(expected, result)
Example #13
0
def get_moby_dick_tokens():
    """Return every token of Moby Dick, lower-cased.

    Tokenizes the raw Gutenberg text, keeping only alphanumeric tokens
    and stripping punctuation.
    """
    raw_text = gutenberg.raw('melville-moby_dick.txt')
    return [
        word.lower()
        for word in tokenize(
            raw_text, only_alphanum=True, clean_punctuation=True)
    ]
Example #14
0
 def test_tokenize_only_alpha(self):
     """only_alpha=True drops purely numeric tokens."""
     result = tokenize("123 lalala 123 pepepe 4566 sarasaa",
                       only_alpha=True)
     expected = ['lalala', 'pepepe', 'sarasaa']
     # assertEquals is a deprecated alias; assertEqual is the supported name.
     self.assertEqual(expected, result)
 def tokenizer_wrapper(raw_text):
     """Tokenize *raw_text*, coerce each token to ``str`` and lower-case it.

     Returns a concrete list rather than nested lazy ``map`` objects:
     under Python 3 ``map`` is a single-pass iterator, which silently
     breaks callers that iterate the result more than once.
     """
     tokens = tokenize(raw_text, only_alphanum=True, clean_punctuation=True)
     return [str(token).lower() for token in tokens]