Example #1
def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """

    from en.parser.nltk_lite import tokenize

    # Define the test string.
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print repr(s)
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print
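A minimal usage sketch for the tokenizers exercised above, assuming the same en.parser.nltk_lite package is importable (Python 2, matching the code above); tokenize.whitespace and tokenize.wordpunct both return iterators of token strings:

# Minimal sketch: tokenize a string directly, outside of demo().
from en.parser.nltk_lite import tokenize

s = "Good muffins cost $3.88\nin New York."
for token in tokenize.wordpunct(s):
    print token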
Example #2
import os
from en.parser.nltk_lite import tokenize

# get_basedir() and items are defined by the surrounding corpus module.
def raw(files=items):
    # Accept a single file name as well as a sequence of names.
    if isinstance(files, str):
        files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "state_union", file + ".txt")
        f = open(path)
        text = f.read()
        f.close()
        # Yield word and punctuation tokens from the whole document.
        for t in tokenize.wordpunct(text):
            yield t
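A hedged usage sketch for this generator: get_basedir and items come from the surrounding corpus module (not shown here), and '1945-Truman' is an assumed file name under the state_union directory, chosen only for illustration:

# Hypothetical usage; '1945-Truman' is an assumed corpus file name.
for token in raw('1945-Truman'):
    print token,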
Example #3
import os
from en.parser.nltk_lite import tokenize

# get_basedir() and items are defined by the surrounding corpus module.
def raw(files=items):
    # Accept a single file name as well as a sequence of names.
    if isinstance(files, str):
        files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "gutenberg", file + ".txt")
        f = open(path)
        # Skip the Project Gutenberg header; the text proper starts on
        # the line after the '*END*' marker.
        preamble = True
        for line in f:
            if not preamble:
                for t in tokenize.wordpunct(line):
                    yield t
            if line.startswith('*END*'):
                preamble = False
        f.close()
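As with the previous example, a sketch of how the generator might be consumed; 'austen-emma' is an assumed item name in the gutenberg corpus, and itertools.islice is used only to cap the output:

# Hypothetical usage; prints the first 20 tokens after the preamble.
import itertools
for token in itertools.islice(raw('austen-emma'), 20):
    print token,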