示例#1
0
def parse_spacy(passages, lang, verbose=False):
    """Yield each annotated passage paired with a passage built from its dependency annotation.

    The passages are run through ``annotate_all``. For every passage it
    produces, the nodes of layer ``layer0.LAYER_ID`` are sorted by position
    and wrapped in ``ConlluConverter.Node`` objects; each node receives a
    single edge derived from the head and relation values found in the
    terminal's ``tok`` entry. The node list is then handed to
    ``ConlluConverter().build_passage``.

    :param passages: iterable of passages to annotate and parse
    :param lang: language passed to ``annotate_all``; also used for relation
                 lookup when the passage has no "lang" attribute
    :param verbose: forwarded to ``annotate_all``
    :return: generator of (passage, parsed) pairs
    """
    annotated = annotate_all(zip(passages),
                             as_array=True,
                             as_tuples=True,
                             lang=lang,
                             verbose=verbose)
    by_position = operator.attrgetter("position")
    for (passage,) in annotated:
        terminals = sorted(passage.layer(layer0.LAYER_ID).all, key=by_position)

        # Slot 0 holds an argument-less node (presumably the root -- confirm);
        # the remaining slots follow the terminals in position order.
        dep_nodes = [ConlluConverter.Node()]
        for t in terminals:
            dep_nodes.append(
                ConlluConverter.Node(t.position,
                                     terminal=t,
                                     token=ConlluConverter.Token(t.text, t.tag)))

        for dep_node in dep_nodes[1:]:
            terminal = dep_node.terminal
            dep_node.token.paragraph = terminal.paragraph

            # A non-zero head value is added to the node's own position,
            # i.e. it is treated as a relative offset; zero is kept as is.
            head = Attr.HEAD(terminal.tok[Attr.HEAD.value])
            if head:
                head += dep_node.position

            passage_lang = passage.attrib.get("lang", lang)
            rel = Attr.DEP(terminal.tok[Attr.DEP.value], lang=passage_lang)
            assert head is not None and rel is not None, \
                "head=%r, rel=%r for token %d in:\n%s" % (head, rel, dep_node.position, " ".join(map(str, terminals)))

            edge = ConlluConverter.Edge(head, rel, remote=False)
            # The terminal reference is dropped before the edge is linked and
            # attached; the statement order is kept exactly as in the original.
            dep_node.terminal = None
            edge.link_head(dep_nodes)
            dep_node.add_edges([edge])

        parsed = ConlluConverter().build_passage(dep_nodes, passage.ID)
        yield passage, parsed
示例#2
0
def lemmatize(terminal):
    """Return the terminal's lemma, lowercased and passed through PUNCTUATION_REMOVER.

    The lemma is read from ``terminal.tok`` at index ``Attr.LEMMA.value`` and
    converted with ``Attr.LEMMA``. The placeholder lemma "-PRON-" is replaced
    by the terminal's own text.

    :param terminal: object exposing ``tok`` and ``text``
    :return: the normalized lemma, or None if the lookup raises KeyError or
             the lemma is empty/falsy
    """
    try:
        raw_lemma = terminal.tok[Attr.LEMMA.value]
        lemma = Attr.LEMMA(raw_lemma)
    except KeyError:
        return None

    # "-PRON-" is a placeholder rather than a real lemma; use the surface text.
    if lemma == "-PRON-":
        lemma = terminal.text

    if not lemma:
        return None
    return lemma.translate(PUNCTUATION_REMOVER).lower()