import os
import re

# Old NLTK-Lite-style corpus readers: items, get_basedir, tokenize, tree,
# _list_sent and _parse_entry come from the surrounding corpus modules.
def chunked(files=items, chunk_types=('NP',)):
    # Accept a single filename as well as a sequence of filenames.
    if isinstance(files, str): files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)
def tagged(files=items):
    if isinstance(files, str): files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            # Keep the (word, tag) pairs; discard the chunk column.
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
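Both CoNLL-2000 readers above depend on blank-line tokenization to recover sentence blocks from the three-column word/tag/chunk format. A minimal self-contained sketch of that splitting step, with a hypothetical blankline_split helper standing in for tokenize.blankline and invented sample data:

import re

def blankline_split(text):
    # Hypothetical stand-in for tokenize.blankline: split a text into
    # blocks wherever one or more blank lines occur.
    return [b for b in re.split(r'\n\s*\n', text.strip()) if b]

# Invented CoNLL-2000-style data: one "word tag chunk" triple per line,
# with sentences separated by a blank line.
sample = "He PRP B-NP\nsaw VBD B-VP\n\nShe PRP B-NP\nleft VBD B-VP\n"
for sent in blankline_split(sample):
    print([tuple(line.split()) for line in sent.splitlines()])
# -> [('He', 'PRP', 'B-NP'), ('saw', 'VBD', 'B-VP')]
# -> [('She', 'PRP', 'B-NP'), ('left', 'VBD', 'B-VP')]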
def _read(files, conversion_function):
    if isinstance(files, str): files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        # Each blank-line-separated block is one sentence; hand it to the
        # caller-supplied conversion function.
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)
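The private `_read` above is a dispatcher: each public reader supplies a conversion function that turns one sentence block into the desired representation. A hedged sketch of that wrapping pattern (both helper names below are illustrative, not the module's own):

def _to_words(sent):
    # Illustrative conversion function: split a sentence block into tokens.
    return sent.split()

def raw_sketch(files):
    # How a public reader might delegate to the _read above.
    return _read(files, _to_words)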
def raw(files='rotokas'):
    """
    @param files: One or more Shoebox dictionary files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so, convert to a tuple so we can iterate.
    if isinstance(files, str): files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "shoebox", file + ".dic")
        f = open(path).read()
        # Dictionary entries are separated by blank lines.
        for entry in tokenize.blankline(f):
            yield list(_parse_entry(entry))
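`_parse_entry` is private to the Shoebox module and not shown here. Shoebox records consist of backslash-marked fields ("\lx", "\ps", and so on), so a rough, hypothetical parser along those lines would yield one value per field; the helper and data below are invented for illustration:

import re

def _parse_entry_sketch(entry):
    # Hypothetical stand-in for _parse_entry: yield the value of each
    # backslash-marked field in one Shoebox record.
    for line in entry.splitlines():
        m = re.match(r'\\(\S+)\s*(.*)', line)
        if m:
            yield m.group(2)

entry = "\\lx kaakau\n\\ps N\n\\ge bamboo"
print(list(_parse_entry_sketch(entry)))   # -> ['kaakau', 'N', 'bamboo']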
def _read(files, conversion_function):
    if isinstance(files, str): files = (files,)

    # Strip YCOE markup before conversion: <...>_CODE tokens and
    # whitespace-preceded ..._ID tokens.  Compile the pattern once,
    # outside the file loop.
    rx_pattern = re.compile(r"""
            <.*>_CODE
            |\s.*_ID
        """, re.VERBOSE | re.UNICODE)

    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            sent = re.sub(rx_pattern, "", sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
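To see what the markup-stripping regex does, here is a self-contained run on an invented YCOE-style sentence block (the token data is made up):

import re

rx_pattern = re.compile(r"""
        <.*>_CODE
        |\s.*_ID
    """, re.VERBOSE | re.UNICODE)

# Invented word_TAG tokens, a trailing code token, and an id on its own line.
sent = "He_PRO com_VBD <T123>_CODE\n (ID_1)_ID"
print(re.sub(rx_pattern, "", sent))   # prints: 'He_PRO com_VBD ' (markup gone)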
def processParagraphs(self, corpus):
    # Method of a corpus-processing class (the class itself is not shown):
    # split the corpus into blank-line-separated paragraphs.
    paragraphs = tokenize.blankline(corpus)
    return paragraphs