Example #1
def tagged(files = items):
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
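A minimal usage sketch for the reader above. The import path en.parser.nltk_lite.corpora.conll2000 and the 'train' item name are assumptions, not confirmed by the listing; the module-level items tuple and get_basedir() are expected to resolve the file against the installed CoNLL-2000 data.

# Usage sketch -- the import path and the 'train' file name are assumptions.
from en.parser.nltk_lite.corpora import conll2000

for i, sent in enumerate(conll2000.tagged('train')):
    print(sent[:5])   # first five (word, tag) pairs of this sentence
    if i == 2:
        break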
Example #2
def chunked(files = items, chunk_types=('NP',)):
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)
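chunked() differs from tagged() only in what it yields: one chunk structure per sentence, built by tree.conll_chunk and restricted to the requested chunk types. A hedged consumption sketch, under the same import-path assumption as above:

# Usage sketch -- conll2000 import path and the 'test' item name are assumptions.
from en.parser.nltk_lite.corpora import conll2000

for t in conll2000.chunked('test', chunk_types=('NP',)):
    print(t)          # chunk tree for the first sentence, NP chunks only
    break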
Example #3
def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """

    from en.parser.nltk_lite import tokenize

    # Define the test string.
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print `s`
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print
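demo() leans on a module-level _display helper that the listing does not show. A guessed stand-in, assuming the helper only needs to materialize the (possibly lazy) token iterator and print it:

def _display(tokens):
    # Hypothetical stand-in for the real helper: force the iterator and print the list.
    print(list(tokens))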
Example #4
def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """

    from en.parser.nltk_lite import tokenize

    # Define the test string.
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print `s`
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print
Example #5
def _read(files, conversion_function):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)
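_read() is a private helper; the public Brown readers presumably wrap it with a concrete conversion function. A hypothetical wrapper, shown only to illustrate how conversion_function plugs in (the 'ca01' file name and the whitespace-only conversion are assumptions made for brevity, not the module's real API):

from en.parser.nltk_lite import tokenize

def words(files='ca01'):
    # Hypothetical reader: one list of whitespace-separated tokens per sentence.
    return _read(files, lambda sent: list(tokenize.whitespace(sent)))

for sent in words():
    print(sent[:10])
    break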
Example #6
def _read(files, conversion_function):
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)
Example #7
def chunked(files=items, chunk_types=("NP",)):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)
Example #8
def tagged(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
Example #9
def chunked(files='chunked'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        s = open(path).read()
        for t in tokenize.blankline(s):
            yield tree.chunk(t)
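A hedged usage sketch for the treebank reader, assuming the module is importable as en.parser.nltk_lite.corpora.treebank and that the default 'chunked' file exists under the treebank data directory:

# Usage sketch -- the import path is an assumption.
from en.parser.nltk_lite.corpora import treebank

for t in treebank.chunked():
    print(t)          # chunk tree for the first blank-line-delimited sentence
    break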
Example #10
def _read(files, conversion_function):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        rx_pattern = re.compile(r"""
                <.*>_CODE
                |\s.*_ID
        """, re.VERBOSE|re.UNICODE)
        mySents = tokenize.blankline(f)
        for sent in mySents:
            sent = re.sub(rx_pattern, '', sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
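The re.sub call strips YCOE bookkeeping tokens (anything tagged _CODE, plus the whitespace-prefixed _ID reference) before the sentence reaches the conversion function. A toy illustration on made-up input, not real YCOE data:

import re

rx_pattern = re.compile(r"""
        <.*>_CODE
        |\s.*_ID
""", re.VERBOSE | re.UNICODE)

sample = "<R_1.2>_CODE Her_PRO saw_VBD\n(cobeowul,1.1)_ID"
print(re.sub(rx_pattern, '', sample))   # -> ' Her_PRO saw_VBD'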
Example #11
def _read(files, conversion_function):
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        rx_pattern = re.compile(
            r"""
                <.*>_CODE
                |\s.*_ID
        """, re.VERBOSE | re.UNICODE)
        mySents = tokenize.blankline(f)
        for sent in mySents:
            sent = re.sub(rx_pattern, '', sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
Example #12
def raw(files='raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            l = []
            for t in tokenize.whitespace(sent):
                l.append(t)
            yield l
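The inner loop simply materializes the whitespace token iterator into a list. A hedged usage sketch under the same treebank import-path assumption as the chunked() example, with the default 'raw' file assumed to be installed:

from en.parser.nltk_lite.corpora import treebank

for sent in treebank.raw():
    print(sent)       # one list of whitespace-separated tokens per sentence
    break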