Example #1
File: conllu.py Project: spacy-io/spaCy
def read_data(
    nlp,
    conllu_file,
    text_file,
    raw_text=True,
    oracle_segments=False,
    max_doc_length=None,
    limit=None,
):
    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True."""
    if not raw_text and not oracle_segments:
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    # sd is spacy doc; cd is conllu doc
    # cs is conllu sent, ct is conllu token
    docs = []
    golds = []
    for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
        sent_annots = []
        for cs in cd:
            sent = defaultdict(list)
            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
                # Skip CoNLL-U empty nodes (decimal ids, used for enhanced
                # dependencies) and multi-word token ranges (hyphenated ids).
                if "." in id_:
                    continue
                if "-" in id_:
                    continue
                id_ = int(id_) - 1
                head = int(head) - 1 if head != "0" else id_
                sent["words"].append(word)
                sent["tags"].append(tag)
                sent["heads"].append(head)
                sent["deps"].append("ROOT" if dep == "root" else dep)
                sent["spaces"].append(space_after == "_")
            sent["entities"] = ["-"] * len(sent["words"])
            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
                golds.append(GoldParse(docs[-1], **sent))

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
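
A hedged usage sketch for read_data: the function reads text_file directly (it calls text_file.read()), so the caller passes open file handles. The corpus paths and pipeline below are placeholders, and split_text, read_conllu, and _make_gold are helpers defined elsewhere in conllu.py.

import spacy

# Placeholder paths; read_data consumes open file handles, since it calls
# text_file.read() itself and hands conllu_file to read_conllu.
nlp = spacy.blank("en")
with open("train.txt", encoding="utf8") as text_file, \
        open("train.conllu", encoding="utf8") as conllu_file:
    docs, golds = read_data(nlp, conllu_file, text_file,
                            raw_text=True, max_doc_length=10, limit=100)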
Example #2
def extract_docs_and_golds(nlp, conllu_file):
    parsed_sentences = []
    gold_sentences = []
    documents = defaultdict(list)
    documents_gold_sentences = defaultdict(list)
    gold_segmentation = defaultdict(list)

    with open(conllu_file, "r", encoding="utf8") as conllu:
        # Sentence blocks are separated by blank lines; [:-1] drops the
        # trailing element after the final block.
        for chunk in conllu.read().split('\n\n')[:-1]:
            lines = chunk.split('\n')
            # Heuristically recover document and sentence ids from the comment
            # lines, whose layout varies between treebanks.
            if lines[0].startswith('# newdoc'):
                docid, sentid = lines[1].split(' ')[-2:]
                lines = lines[1:]
            elif lines[0].startswith('# source'):
                docid, sentid = 0, 0
            elif lines[0].startswith('# text'):
                docid, sentid = lines[0].split(' ')[-2:]
                lines = [''] + lines  # pad so the "# text" line sits at index 1
            else:
                docid, sentid = lines[0].split(' ')[-1].rsplit('_', 1)
            text = lines[1].split('=')[-1].strip()  # the "# text = ..." line
            if lines[2].startswith('# sent_id'):
                lines = lines[1:]
            sent_words = []
            sent_tags = []
            sent_heads = []
            sent_deps = []
            for line in lines[2:]:
                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split('\t')
                if '.' in id_:
                    continue
                if '-' in id_:
                    continue
                id_ = int(id_) - 1
                try:
                    head = int(head) - 1 if head != '0' else id_
                except ValueError:
                    head = id_
                sent_words.append(word)
                sent_tags.append(tag)
                sent_heads.append(head)
                sent_deps.append('ROOT' if dep == 'root' else dep)
            sent_heads, sent_deps = projectivize(sent_heads, sent_deps)
            # Normalize whitespace in the text: handling stray spacing is not
            # spaCy's job and should not be part of the evaluation.
            text = re.sub(r'\s+', ' ', text).strip()
            parsed_sentences.append(nlp(text))
            gold = GoldParse(Doc(nlp.vocab, words=sent_words), words=sent_words, heads=sent_heads,
                             tags=sent_tags, deps=sent_deps,
                             entities=['-'] * len(sent_words))
            gold_sentences.append(gold)
            documents[docid].append(text)
            documents_gold_sentences[docid].append(gold)
            gold_segmentation[docid].append([1] + [0] * (len(sent_words) - 1))
    return parsed_sentences, gold_sentences, gold_segmentation, documents, documents_gold_sentences
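
For reference, each token line that line.split('\t') unpacks above carries the ten standard CoNLL-U columns: ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC. A minimal made-up chunk in that shape (fields are tab-separated):

# sent_id = doc1_1
# text = Cats sleep.
1	Cats	cat	NOUN	NNS	Number=Plur	2	nsubj	_	_
2	sleep	sleep	VERB	VBP	_	0	root	_	SpaceAfter=No
3	.	.	PUNCT	.	_	2	punct	_	_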
Example #3
File: conllu.py Project: zby0902/spaCy
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
              max_doc_length=None, limit=None):
    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True.'''
    if not raw_text and not oracle_segments:
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    # sd is spacy doc; cd is conllu doc
    # cs is conllu sent, ct is conllu token
    docs = []
    golds = []
    for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
        sent_annots = []
        for cs in cd:
            sent = defaultdict(list)
            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
                if '.' in id_:
                    continue
                if '-' in id_:
                    continue
                id_ = int(id_)-1
                head = int(head)-1 if head != '0' else id_
                sent['words'].append(word)
                sent['tags'].append(tag)
                sent['heads'].append(head)
                sent['deps'].append('ROOT' if dep == 'root' else dep)
                sent['spaces'].append(space_after == '_')
            sent['entities'] = ['-'] * len(sent['words'])
            sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                                       sent['deps'])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
                golds.append(GoldParse(docs[-1], **sent))

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
Example #4
def test_get_oracle_actions():
    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
    parser = DependencyParser(doc.vocab)
    parser.moves.add_action(0, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(4, "ROOT")
    for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
    heads, deps = projectivize(heads, deps)
    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
    parser.moves.preprocess_gold(gold)
    parser.moves.get_oracle_sequence(doc, gold)
Example #5
def test_get_oracle_actions():
    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
    parser = DependencyParser(doc.vocab)
    parser.moves.add_action(0, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(4, "ROOT")
    for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
    heads, deps = projectivize(heads, deps)
    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
    parser.moves.preprocess_gold(gold)
    parser.moves.get_oracle_sequence(doc, gold)
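
annot_tuples is a fixture defined elsewhere in the test module; the unpacking above shows it holds (id, word, tag, head, dep, ent) tuples. A hypothetical stand-in with that shape:

# Hypothetical stand-in for the module-level fixture; the real values live in
# the test file. Shape per tuple: (id, word, tag, head, dep, ent).
annot_tuples = [
    (0, "The", "DT", 1, "det", "O"),
    (1, "dog", "NN", 2, "nsubj", "O"),
    (2, "barks", "VBZ", 2, "ROOT", "O"),
    (3, ".", ".", 2, "punct", "O"),
]

The numeric codes passed to add_action appear to follow spaCy's ArcEager move enum (SHIFT=0, REDUCE=1, LEFT=2, RIGHT=3, BREAK=4), which is why a head to the right of a token registers a LEFT-arc action and a head to the left registers a RIGHT-arc action.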
Example #6
def read_data(
    nlp,
    conllu_file,
    text_file,
    raw_text=True,
    oracle_segments=False,
    max_doc_length=None,
    limit=None,
):
    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True."""
    if not raw_text and not oracle_segments:
        raise ValueError(
            "At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    # sd is spacy doc; cd is conllu doc
    # cs is conllu sent, ct is conllu token
    docs = []
    golds = []
    for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
        sent_annots = []
        for cs in cd:
            sent = defaultdict(list)
            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
                if "." in id_:
                    continue
                if "-" in id_:
                    continue
                id_ = int(id_) - 1
                head = int(head) - 1 if head != "0" else id_
                sent["words"].append(word)
                sent["tags"].append(tag)
                sent["morphology"].append(_parse_morph_string(morph))
                sent["morphology"][-1].add("POS_%s" % pos)
                sent["heads"].append(head)
                sent["deps"].append("ROOT" if dep == "root" else dep)
                sent["spaces"].append(space_after == "_")
            sent["entities"] = ["-"] * len(sent["words"])
            sent["heads"], sent["deps"] = projectivize(sent["heads"],
                                                       sent["deps"])
            if oracle_segments:
                docs.append(
                    Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
                golds.append(GoldParse(docs[-1], **sent))
                assert golds[-1].morphology is not None

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(
                    sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
                assert gold.morphology is not None
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
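
Example #6 extends the loader with morphology. _parse_morph_string is defined elsewhere in the same script; a plausible sketch, assuming it maps a CoNLL-U FEATS string such as Case=Nom|Number=Sing to a set of lower-cased feature flags (it must return a set, since the caller adds a "POS_%s" flag to it):

def _parse_morph_string(morph_string):
    # Sketch only: "Case=Nom|Number=Sing" -> {"case_nom", "number_sing"};
    # "_" (no features) yields an empty set. The caller then .add()s a
    # "POS_..." flag, so the return type must be a mutable set.
    if morph_string == "_":
        return set()
    features = set()
    for feature in morph_string.split("|"):
        key, _, value = feature.partition("=")
        features.add("%s_%s" % (key.lower(), value.lower()))
    return features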
Example #7
def test_parser_pseudoprojectivity(en_tokenizer):
    def deprojectivize(proj_heads, deco_labels):
        tokens = en_tokenizer("whatever " * len(proj_heads))
        rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
        doc = get_doc(
            tokens.vocab,
            words=[t.text for t in tokens],
            deps=deco_labels,
            heads=rel_proj_heads,
        )
        nonproj.deprojectivize(doc)
        return [t.head.i for t in doc], [token.dep_ for token in doc]

    # fmt: off
    tree = [1, 2, 2]
    nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
    nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    labels = [
        "det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"
    ]
    labels2 = [
        "advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det",
        "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"
    ]
    # fmt: on

    assert nonproj.decompose("X||Y") == ("X", "Y")
    assert nonproj.decompose("X") == ("X", "")
    assert nonproj.is_decorated("X||Y") is True
    assert nonproj.is_decorated("X") is False

    nonproj._lift(0, tree)
    assert tree == [2, 2, 2]

    assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
    assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10

    # fmt: off
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
    assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
    assert deco_labels == [
        "det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||dobj",
        "punct"
    ]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree
    assert undeco_labels == labels

    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
    assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    assert deco_labels == [
        "advmod||aux", "root", "det", "nsubj", "advmod", "det", "dobj", "det",
        "nmod", "aux", "nmod||dobj", "advmod", "det", "amod", "punct"
    ]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree2
    assert undeco_labels == labels2

    # if the decoration is wrong, such that there is no head with the desired
    # label, the structure is kept and the label is undecorated
    proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    deco_labels = [
        "det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl||iobj",
        "punct"
    ]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == proj_heads
    assert undeco_labels == [
        "det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"
    ]

    # if there are two potential new heads, the first one is chosen even if
    # it's wrong
    proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    deco_labels = [
        "advmod||aux", "root", "det", "aux", "advmod", "det", "dobj", "det",
        "nmod", "aux", "nmod||dobj", "advmod", "det", "amod", "punct"
    ]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    assert undeco_labels == [
        "advmod", "root", "det", "aux", "advmod", "det", "dobj", "det", "nmod",
        "aux", "nmod", "advmod", "det", "amod", "punct"
    ]
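
The decompose and is_decorated behavior asserted at the top of this test pins down the label-decoration scheme; a minimal re-implementation consistent with those asserts (spaCy's own versions live in spacy.syntax.nonproj):

DELIMITER = "||"  # separates the original label from the lifted head's label

def decompose(label):
    # "acl||dobj" -> ("acl", "dobj"); undecorated labels map to (label, "").
    head_label, _, attach_label = label.partition(DELIMITER)
    return (head_label, attach_label)

def is_decorated(label):
    return DELIMITER in label

assert decompose("X||Y") == ("X", "Y")
assert decompose("X") == ("X", "")
assert is_decorated("X||Y") and not is_decorated("X")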
Example #8
def test_parser_pseudoprojectivity(en_tokenizer):
    def deprojectivize(proj_heads, deco_labels):
        tokens = en_tokenizer("whatever " * len(proj_heads))
        rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
        doc = get_doc(
            tokens.vocab,
            words=[t.text for t in tokens],
            deps=deco_labels,
            heads=rel_proj_heads,
        )
        nonproj.deprojectivize(doc)
        return [t.head.i for t in doc], [token.dep_ for token in doc]

    # fmt: off
    tree = [1, 2, 2]
    nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
    nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
    labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
    # fmt: on

    assert nonproj.decompose("X||Y") == ("X", "Y")
    assert nonproj.decompose("X") == ("X", "")
    assert nonproj.is_decorated("X||Y") is True
    assert nonproj.is_decorated("X") is False

    nonproj._lift(0, tree)
    assert tree == [2, 2, 2]

    assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
    assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10

    # fmt: off
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
    assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
    assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                           "nsubj", "acl||dobj", "punct"]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree
    assert undeco_labels == labels

    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
    assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
                           "det", "dobj", "det", "nmod", "aux", "nmod||dobj",
                           "advmod", "det", "amod", "punct"]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree2
    assert undeco_labels == labels2

    # if the decoration is wrong, such that there is no head with the desired
    # label, the structure is kept and the label is undecorated
    proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
                   "acl||iobj", "punct"]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == proj_heads
    assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                             "nsubj", "acl", "punct"]

    # if there are two potential new heads, the first one is chosen even if
    # it's wrong
    proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
                   "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
                   "det", "amod", "punct"]

    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    assert undeco_labels == ["advmod", "root", "det", "aux", "advmod", "det",
                             "dobj", "det", "nmod", "aux", "nmod", "advmod",
                             "det", "amod", "punct"]
Example #9
def read_data(nlp,
              conllu_file,
              text_file,
              raw_text=True,
              oracle_segments=False,
              limit=None):
    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True.'''
    if not raw_text and not oracle_segments:
        raise ValueError(
            "At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    # sd is spacy doc; cd is conllu doc
    # cs is conllu sent, ct is conllu token
    docs = []
    golds = []
    for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
        doc_words = []
        doc_tags = []
        doc_heads = []
        doc_deps = []
        doc_ents = []
        for cs in cd:
            sent_words = []
            sent_tags = []
            sent_heads = []
            sent_deps = []
            for id_, word, lemma, pos, tag, morph, head, dep, _1, _2 in cs:
                if '.' in id_:
                    continue
                if '-' in id_:
                    continue
                id_ = int(id_) - 1
                head = int(head) - 1 if head != '0' else id_
                sent_words.append(word)
                sent_tags.append(tag)
                sent_heads.append(head)
                sent_deps.append('ROOT' if dep == 'root' else dep)
            if oracle_segments:
                sent_heads, sent_deps = projectivize(sent_heads, sent_deps)
                docs.append(Doc(nlp.vocab, words=sent_words))
                golds.append(
                    GoldParse(docs[-1],
                              words=sent_words,
                              heads=sent_heads,
                              tags=sent_tags,
                              deps=sent_deps,
                              entities=['-'] * len(sent_words)))
            for head in sent_heads:
                doc_heads.append(len(doc_words) + head)
            doc_words.extend(sent_words)
            doc_tags.extend(sent_tags)
            doc_deps.extend(sent_deps)
            doc_ents.extend(['-'] * len(sent_words))
            # Create a GoldParse object for the sentence
        doc_heads, doc_deps = projectivize(doc_heads, doc_deps)
        if raw_text:
            docs.append(nlp.make_doc(text))
            golds.append(
                GoldParse(docs[-1],
                          words=doc_words,
                          tags=doc_tags,
                          heads=doc_heads,
                          deps=doc_deps,
                          entities=doc_ents))
        if limit and doc_id >= limit:
            break
    return docs, golds
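
A worked example of the head-offsetting step above (doc_heads.append(len(doc_words) + head)): heads are already 0-based within each sentence, so adding the running word count rebases them to document-level positions.

# If the document already holds 5 words and the next sentence's 0-based
# in-sentence heads are [1, 1, 0], the document-level heads become [6, 6, 5].
doc_words_len = 5
sent_heads = [1, 1, 0]
print([doc_words_len + h for h in sent_heads])  # -> [6, 6, 5]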