def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility: parse *text* and return, for each sentence,
        the result of the transformation function *xform*.
        all_names = True  -> comprehensive name register
        all_names = False -> simple name register
        all_names = None  -> no name register """
    started = time.time()
    # Mark paragraph boundaries in the incoming text
    paragraphed = mark_paragraphs(text)
    # Tokenize, then run entity recognition over the token stream
    stream = tokenize(paragraphed)
    toklist = list(recognize_entities(stream, enclosing_session=session))
    tokenized_at = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    register = None
    if all_names is not None:
        from queries.builtin import create_name_register
        register = create_name_register(toklist, session, all_names=all_names)
    finished = time.time()
    # Record timing statistics alongside the parse stats
    stats["tok_time"] = tokenized_at - started
    stats["parse_time"] = finished - tokenized_at
    stats["total_time"] = finished - started
    return pgs, stats, register
def tag_toklist(session, toklist, all_names=False):
    """ Parse plain text and return the parsed paragraphs as lists of
        sentences where each sentence is a list of tagged tokens """

    def xform(tokens, tree, err_index):
        """ Transformation function that simply returns a list of
            POS-tagged, normalized tokens for the sentence """
        return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

    # Don't emit diagnostic messages from the parser
    with Fast_Parser(verbose=False) as parser:
        pgs, stats = TreeUtility._process_toklist(
            parser, session, toklist, xform
        )
        from queries.builtin import create_name_register

        register = create_name_register(toklist, session, all_names=all_names)
        return pgs, stats, register