def test_tokenize_clean_punct_false(self):
    """Punctuation marks must survive as standalone tokens when
    clean_punctuation=False and only_alphanum=False."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_punctuation=False,
        only_alphanum=False)
    expected = [
        'A', 'written', 'language', 'is', 'the', 'representation', 'of',
        'a', 'language', 'by', 'means', 'of', 'a', 'writing', 'system',
        '.', 'Written', 'language', 'is', 'an', 'invention', 'in', 'that',
        'it', 'must', 'be', 'taught', 'to', 'children', ';', 'children',
        'will', 'pick', 'up', 'spoken', 'language', '(', 'oral', 'or',
        'sign', ')', 'by', 'exposure', 'without', 'being', 'specifically',
        'taught', '.'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def set_number_of_tokens():
    """Persist each document's token count.

    Iterates every document in the configured MongoDB collection, tokenizes
    its 'text' field, and stores the count in 'number_of_words'.
    """
    from includes.tokenizer import tokenize
    from pymongo import MongoClient

    client = MongoClient()
    db = client[config.DATABASE_NAME]
    for document in db.document.find():
        # update_one with $set writes only the changed field and replaces
        # the deprecated Collection.save() call (removed in PyMongo 4).
        db.document.update_one(
            {'_id': document['_id']},
            {'$set': {'number_of_words': len(tokenize(document['text']))}})
def test_tokenize_clean_stop_words(self):
    """With clean_stop_words=True common function words must be removed."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_stop_words=True)
    expected = [
        'written', 'language', 'representation', 'language', 'means',
        'writing', 'system', 'Written', 'language', 'invention', 'must',
        'taught', 'children', 'children', 'pick', 'spoken', 'language',
        'oral', 'sign', 'exposure', 'without', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_moby_dick_window(self):
    """Windowing Moby Dick must visit every token exactly once, for a
    range of window sizes."""
    # range works on both Python 2 and 3; xrange is Python-2-only.
    window_sizes = range(100, 6000, 100)
    text = gutenberg.raw('melville-moby_dick.txt')
    tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
    total_number_of_tokens = len(tokens)
    for window_size in window_sizes:
        count = 0
        # NOTE(review): on Python 2 this is floor division, so ceil() is a
        # no-op; the +1 in the loop below covers a trailing partial window.
        number_of_windows = int(
            math.ceil(total_number_of_tokens / window_size))
        for current_window in range(0, number_of_windows + 1):
            word_window = Window(tokens, window_size, current_window)
            for word in word_window:
                count += 1
        # assertEquals is deprecated; assertEqual is the supported name.
        self.assertEqual(count, total_number_of_tokens)
def test_tokenize_happy_path(self):
    """Default tokenize() must drop punctuation and single-letter tokens
    while keeping word order."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught."
    )
    expected = [
        'written', 'language', 'is', 'the', 'representation', 'of',
        'language', 'by', 'means', 'of', 'writing', 'system', 'Written',
        'language', 'is', 'an', 'invention', 'in', 'that', 'it', 'must',
        'be', 'taught', 'to', 'children', 'children', 'will', 'pick', 'up',
        'spoken', 'language', 'oral', 'or', 'sign', 'by', 'exposure',
        'without', 'being', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_moby_dick_window(self):
    """Windowing Moby Dick must visit every token exactly once, for a
    range of window sizes."""
    # range works on both Python 2 and 3; xrange is Python-2-only.
    window_sizes = range(100, 6000, 100)
    text = gutenberg.raw('melville-moby_dick.txt')
    tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
    total_number_of_tokens = len(tokens)
    for window_size in window_sizes:
        count = 0
        # NOTE(review): on Python 2 this is floor division, so ceil() is a
        # no-op; the +1 in the loop below covers a trailing partial window.
        number_of_windows = int(
            math.ceil(total_number_of_tokens / window_size))
        for current_window in range(0, number_of_windows + 1):
            word_window = Window(tokens, window_size, current_window)
            for word in word_window:
                count += 1
        # assertEquals is deprecated; assertEqual is the supported name.
        self.assertEqual(count, total_number_of_tokens)
def test_tokenize_clean_punct_false(self):
    """Punctuation marks must survive as standalone tokens when
    clean_punctuation=False and only_alphanum=False."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_punctuation=False,
        only_alphanum=False)
    expected = [
        'A', 'written', 'language', 'is', 'the', 'representation', 'of',
        'a', 'language', 'by', 'means', 'of', 'a', 'writing', 'system',
        '.', 'Written', 'language', 'is', 'an', 'invention', 'in', 'that',
        'it', 'must', 'be', 'taught', 'to', 'children', ';', 'children',
        'will', 'pick', 'up', 'spoken', 'language', '(', 'oral', 'or',
        'sign', ')', 'by', 'exposure', 'without', 'being', 'specifically',
        'taught', '.'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def tokenizer_wrapper(raw_text):
    """Tokenize raw_text into lower-cased alphanumeric tokens.

    Returns a list. A comprehension (matching get_moby_dick_tokens) is used
    instead of map(str.lower, ...): on Python 3 map is a lazy one-shot
    iterator, and the unbound str.lower rejects non-str token types.
    """
    return [
        token.lower()
        for token in tokenize(raw_text, only_alphanum=True,
                              clean_punctuation=True)
    ]
def test_tokenize_only_alpha(self):
    """only_alpha=True must drop purely numeric tokens."""
    # PEP 8: no spaces around '=' in keyword arguments.
    result = tokenize("123 lalala 123 pepepe 4566 sarasaa", only_alpha=True)
    expected = ['lalala', 'pepepe', 'sarasaa']
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_tokenize_clean_stop_words(self):
    """With clean_stop_words=True common function words must be removed."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught.",
        clean_stop_words=True)
    expected = [
        'written', 'language', 'representation', 'language', 'means',
        'writing', 'system', 'Written', 'language', 'invention', 'must',
        'taught', 'children', 'children', 'pick', 'spoken', 'language',
        'oral', 'sign', 'exposure', 'without', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def test_tokenize_happy_path(self):
    """Default tokenize() must drop punctuation and single-letter tokens
    while keeping word order."""
    result = tokenize(
        "A written language is the representation of a language by means of a writing system. Written language is an invention in that it must be taught to children; children will pick up spoken language (oral or sign) by exposure without being specifically taught."
    )
    expected = [
        'written', 'language', 'is', 'the', 'representation', 'of',
        'language', 'by', 'means', 'of', 'writing', 'system', 'Written',
        'language', 'is', 'an', 'invention', 'in', 'that', 'it', 'must',
        'be', 'taught', 'to', 'children', 'children', 'will', 'pick', 'up',
        'spoken', 'language', 'oral', 'or', 'sign', 'by', 'exposure',
        'without', 'being', 'specifically', 'taught'
    ]
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def get_moby_dick_tokens():
    """Return the lower-cased alphanumeric tokens of Melville's Moby Dick."""
    raw_text = gutenberg.raw('melville-moby_dick.txt')
    words = tokenize(raw_text, only_alphanum=True, clean_punctuation=True)
    return [word.lower() for word in words]
def test_tokenize_only_alpha(self):
    """only_alpha=True must drop purely numeric tokens."""
    result = tokenize("123 lalala 123 pepepe 4566 sarasaa", only_alpha=True)
    expected = ['lalala', 'pepepe', 'sarasaa']
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(expected, result)
def tokenizer_wrapper(raw_text):
    """Tokenize raw_text, coercing each token to str and lower-casing it.

    Returns a list. A single comprehension replaces the nested
    map(str.lower, map(str, ...)) chain: clearer, one pass, and on
    Python 3 it returns a reusable list rather than a lazy one-shot map.
    """
    return [
        str(token).lower()
        for token in tokenize(raw_text, only_alphanum=True,
                              clean_punctuation=True)
    ]