def parse_spacy(passages, lang, verbose=False):
    """Annotate passages and rebuild each one from its dependency annotation.

    Every passage is run through ``annotate_all``; its layer-0 terminals are
    then turned into ``ConlluConverter`` dependency nodes (one edge per
    terminal), and ``ConlluConverter.build_passage`` builds a new passage from
    those nodes.

    :param passages: iterable of passages to process
    :param lang: language passed to ``annotate_all``; also the fallback when a
                 passage has no ``"lang"`` entry in its ``attrib``
    :param verbose: forwarded unchanged to ``annotate_all``
    :return: generator of ``(passage, parsed)`` pairs, where ``passage`` is the
             item yielded by ``annotate_all`` and ``parsed`` is the passage
             built from the dependency nodes
    :raises AssertionError: if the head or the relation of a terminal
                            resolves to ``None``
    """
    annotated = annotate_all(zip(passages), as_array=True, as_tuples=True, lang=lang, verbose=verbose)
    # Passages are fed in as 1-tuples (``zip(passages)``) and unpacked the same way.
    for (passage,) in annotated:
        terminals = sorted(passage.layer(layer0.LAYER_ID).all, key=operator.attrgetter("position"))

        # The first node is created without arguments and has no terminal;
        # presumably it is the artificial root -- TODO confirm in ConlluConverter.
        dep_nodes = [ConlluConverter.Node()]
        for terminal in terminals:
            dep_nodes.append(
                ConlluConverter.Node(
                    terminal.position,
                    terminal=terminal,
                    token=ConlluConverter.Token(terminal.text, terminal.tag),
                )
            )

        for node in dep_nodes[1:]:
            terminal = node.terminal
            node.token.paragraph = terminal.paragraph

            head = Attr.HEAD(terminal.tok[Attr.HEAD.value])
            # A falsy head (e.g. 0) is left untouched; any other value is
            # shifted by the node's own position. Presumably the stored head is
            # an offset relative to the token -- verify against Attr.HEAD.
            if head:
                head += node.position

            rel = Attr.DEP(terminal.tok[Attr.DEP.value], lang=passage.attrib.get("lang", lang))
            assert head is not None and rel is not None, \
                "head=%r, rel=%r for token %d in:\n%s" % (head, rel, node.position, " ".join(map(str, terminals)))

            edge = ConlluConverter.Edge(head, rel, remote=False)
            # The terminal reference is cleared only after everything needed
            # from it has been read above.
            node.terminal = None
            edge.link_head(dep_nodes)
            node.add_edges([edge])

        yield passage, ConlluConverter().build_passage(dep_nodes, passage.ID)
def lemmatize(terminal):
    """Return a normalized lemma for ``terminal``, or ``None`` if unavailable.

    The lemma is read from ``terminal.tok`` at index ``Attr.LEMMA.value`` and
    passed through ``Attr.LEMMA``. The result is translated with
    ``PUNCTUATION_REMOVER`` and lower-cased.

    :param terminal: object exposing ``tok`` (indexable) and ``text``
    :return: the normalized lemma string, or ``None`` when the lookup raises
             ``KeyError`` or the lemma is empty/falsy
    """
    try:
        raw_lemma = Attr.LEMMA(terminal.tok[Attr.LEMMA.value])
    except KeyError:
        return None

    # "-PRON-" is replaced by the terminal's own text; presumably it is the
    # annotator's placeholder lemma for pronouns -- TODO confirm.
    chosen = terminal.text if raw_lemma == "-PRON-" else raw_lemma
    if not chosen:
        return None
    return chosen.translate(PUNCTUATION_REMOVER).lower()