def _parse_entry(s):
    """
    Yield ``(name, value)`` pairs parsed from a lexicon entry string.

    Each line of ``s`` is expected to begin with a backslash-prefixed
    field name, optionally followed by a single space and a value.
    Lines with no value yield ``(name, None)``.

    @param s: The entry string, one field per line.
    @return: An iterator over ``(field_name, field_value)`` tuples.
    """
    # NOTE(review): parameter renamed from ``str`` (which shadowed the
    # builtin) to ``s``, matching the sibling functions in this module.
    for line in tokenize.line(s):
        # Remove the leading backslash and trailing whitespace.
        field = line[1:].strip()
        # Split at the first space only; use the str.split method rather
        # than the deprecated ``string.split`` function so this also
        # runs under Python 3.
        field = tuple(field.split(" ", 1))
        if len(field) == 1:
            # No value present on this line.
            field = (field[0], None)
        yield field
def conll_chunk(s, chunk_types=("NP",), top_node="S"):
    """
    @param s: A single sentence encoded as a CONLL 2000 style string,
        one token per line.
    @param chunk_types: The chunk types to keep; any other chunk tag is
        treated as "Outside".  C{None} keeps all chunk types.
    @param top_node: The node label for the top-level tree.
    @return: A chunk structure for a single sentence encoded in the
        given CONLL 2000 style string.
    @rtype: L{Tree}
    @raise ValueError: If a line does not match the expected format.
    """
    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(tokenize.line(s)):
        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            # Call syntax (not the Python-2-only ``raise E, msg``
            # statement) so this module also parses under Python 3.
            # NOTE(review): lineno is zero-based here — confirm whether
            # a one-based line number was intended in the message.
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            # A depth-2 stack means a chunk is currently open; close it.
            if len(stack) == 2:
                stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
def _list_sent(sent):
    """
    Tokenize each line of ``sent`` on whitespace.

    @param sent: A sentence string, possibly spanning several lines.
    @return: A list with one whitespace-tokenized entry per line.
    """
    return list(map(tokenize.whitespace, tokenize.line(sent)))