def read_data(
    nlp,
    conllu_file,
    text_file,
    raw_text=True,
    oracle_segments=False,
    max_doc_length=None,
    limit=None,
):
    """Read the CONLLU format into (Doc, GoldParse) tuples.

    If raw_text=True, include Doc objects created using nlp.make_doc and then
    aligned against the gold-standard sequences. If oracle_segments=True,
    include Doc objects created from the gold-standard segments. At least one
    must be True.
    """
    if not (raw_text or oracle_segments):
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs = []
    golds = []
    # One paragraph of raw text pairs with one CONLLU document.
    for paragraph, conllu_doc in zip(paragraphs, conllu):
        sent_annots = []
        for conllu_sent in conllu_doc:
            annot = defaultdict(list)
            for fields in conllu_sent:
                id_, word, lemma, pos, tag, morph, head, dep, _, space_after = fields
                # Skip empty nodes ("x.y" ids) and multi-word tokens ("x-y" ids).
                if "." in id_ or "-" in id_:
                    continue
                token_i = int(id_) - 1
                # "0" marks the root; encode it as a self-attachment,
                # otherwise convert the 1-based head to 0-based.
                annot["words"].append(word)
                annot["tags"].append(tag)
                annot["heads"].append(token_i if head == "0" else int(head) - 1)
                annot["deps"].append("ROOT" if dep == "root" else dep)
                annot["spaces"].append(space_after == "_")
            annot["entities"] = ["-"] * len(annot["words"])
            annot["heads"], annot["deps"] = projectivize(annot["heads"], annot["deps"])
            if oracle_segments:
                sent_doc = Doc(nlp.vocab, words=annot["words"], spaces=annot["spaces"])
                docs.append(sent_doc)
                golds.append(GoldParse(sent_doc, **annot))
            sent_annots.append(annot)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                # Flush the accumulated sentences into one raw-text doc.
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds
        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
            if limit and len(docs) >= limit:
                return docs, golds
    return docs, golds
def extract_docs_and_golds(nlp, conllu_file):
    """Parse a CONLLU file into parsed/gold sentences plus per-document groupings.

    Returns a 5-tuple:
        parsed_sentences: Doc objects produced by running ``nlp`` on each
            sentence's (whitespace-normalised) text
        gold_sentences: GoldParse objects built from the gold annotations
        gold_segmentation: docid -> list of [1, 0, 0, ...] sentence-start flags
        documents: docid -> list of sentence texts
        documents_gold_sentences: docid -> list of GoldParse objects
    """
    parsed_sentences = []
    gold_sentences = []
    documents = defaultdict(list)
    documents_gold_sentences = defaultdict(list)
    gold_segmentation = defaultdict(list)
    with open(conllu_file, "r", encoding="utf8") as conllu:
        # Sentences are separated by blank lines; the final chunk is empty.
        for chunk in conllu.read().split('\n\n')[:-1]:
            lines = chunk.split('\n')
            # Recover doc/sentence ids from whichever comment header is present.
            if lines[0].startswith('# newdoc'):
                docid, sentid = lines[1].split(' ')[-2:]
                lines = lines[1:]
            elif lines[0].startswith('# source'):
                docid, sentid = 0, 0
            elif lines[0].startswith('# text'):
                docid, sentid = lines[0].split(' ')[-2:]
                lines = [''] + lines
            else:
                docid, sentid = lines[0].split(' ')[-1].rsplit('_', 1)
            text = lines[1].split('=')[-1].strip()
            if lines[2].startswith('# sent_id'):
                lines = lines[1:]
            sent_words = []
            sent_tags = []
            sent_heads = []
            sent_deps = []
            for line in lines[2:]:
                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split('\t')
                # Skip empty nodes ("x.y") and multi-word tokens ("x-y").
                if '.' in id_:
                    continue
                if '-' in id_:
                    continue
                id_ = int(id_) - 1
                try:
                    # '0' marks the root; encode it as a self-attachment.
                    head = int(head) - 1 if head != '0' else id_
                except ValueError:
                    # Malformed head field: fall back to a self-attachment.
                    head = id_
                sent_words.append(word)
                sent_tags.append(tag)
                sent_heads.append(head)
                sent_deps.append('ROOT' if dep == 'root' else dep)
            sent_heads, sent_deps = projectivize(sent_heads, sent_deps)
            # The text should be cleaned: removing trailing spaces is not the
            # point of spaCy at all and should not be evaluated.
            # Raw string fixes the invalid '\s' escape in the original pattern.
            text = re.sub(r'\s+', ' ', text).strip()
            parsed_sentences.append(nlp(text))
            gold = GoldParse(Doc(nlp.vocab, words=sent_words), words=sent_words,
                             heads=sent_heads, tags=sent_tags, deps=sent_deps,
                             entities=['-'] * len(sent_words))
            gold_sentences.append(gold)
            documents[docid].append(text)
            documents_gold_sentences[docid].append(gold)
            gold_segmentation[docid].append([1] + [0] * (len(sent_words) - 1))
    return (parsed_sentences, gold_sentences, gold_segmentation, documents,
            documents_gold_sentences)
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
              max_doc_length=None, limit=None):
    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True.'''
    if not raw_text and not oracle_segments:
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs, golds = [], []
    for text, cd in zip(paragraphs, conllu):
        sent_annots = []
        for cs in cd:
            sent = defaultdict(list)
            for token in cs:
                id_, word, lemma, pos, tag, morph, head, dep, _, space_after = token
                # Empty nodes ("x.y") and multi-word tokens ("x-y") are skipped.
                if '.' in id_ or '-' in id_:
                    continue
                idx = int(id_) - 1
                sent['words'].append(word)
                sent['tags'].append(tag)
                # '0' marks the root, which we encode as a self-attachment.
                sent['heads'].append(idx if head == '0' else int(head) - 1)
                sent['deps'].append('ROOT' if dep == 'root' else dep)
                sent['spaces'].append(space_after == '_')
            sent['entities'] = ['-'] * len(sent['words'])
            sent['heads'], sent['deps'] = projectivize(sent['heads'], sent['deps'])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
                golds.append(GoldParse(docs[-1], **sent))
            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                # Flush the buffered sentences into a single raw-text doc.
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds
        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
            if limit and len(docs) >= limit:
                return docs, golds
    return docs, golds
def test_get_oracle_actions():
    """The oracle action sequence should be derivable for the fixture parse."""
    words = [fields[1] for fields in annot_tuples]
    doc = Doc(Vocab(), words=words)
    parser = DependencyParser(doc.vocab)
    # Register the base transition actions the oracle may use.
    for move, label in [(0, ""), (1, ""), (1, ""), (4, "ROOT")]:
        parser.moves.add_action(move, label)
    # Add arc actions for each dependency label, split by attachment direction.
    for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
    heads, deps = projectivize(heads, deps)
    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
    parser.moves.preprocess_gold(gold)
    parser.moves.get_oracle_sequence(doc, gold)
def test_get_oracle_actions():
    """get_oracle_sequence should run cleanly on a projectivized gold parse."""
    doc = Doc(Vocab(), words=[entry[1] for entry in annot_tuples])
    parser = DependencyParser(doc.vocab)
    parser.moves.add_action(0, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(4, "ROOT")
    # Register arc actions per label depending on where the head sits.
    for i, entry in enumerate(annot_tuples):
        head, dep = entry[3], entry[4]
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
    heads, deps = projectivize(heads, deps)
    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
    parser.moves.preprocess_gold(gold)
    parser.moves.get_oracle_sequence(doc, gold)
def read_data(
    nlp,
    conllu_file,
    text_file,
    raw_text=True,
    oracle_segments=False,
    max_doc_length=None,
    limit=None,
):
    """Read the CONLLU format into (Doc, GoldParse) tuples.

    If raw_text=True, include Doc objects created using nlp.make_doc and then
    aligned against the gold-standard sequences. If oracle_segments=True,
    include Doc objects created from the gold-standard segments. At least one
    must be True.
    """
    if not (raw_text or oracle_segments):
        raise ValueError(
            "At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs = []
    golds = []
    for paragraph, conllu_doc in zip(paragraphs, conllu):
        sent_annots = []
        for conllu_sent in conllu_doc:
            annot = defaultdict(list)
            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in conllu_sent:
                # Skip empty nodes ("x.y") and multi-word tokens ("x-y").
                if "." in id_ or "-" in id_:
                    continue
                token_i = int(id_) - 1
                annot["words"].append(word)
                annot["tags"].append(tag)
                # Morph features plus the coarse POS folded in as an extra feature.
                features = _parse_morph_string(morph)
                features.add("POS_%s" % pos)
                annot["morphology"].append(features)
                # "0" marks the root; encode it as a self-attachment.
                annot["heads"].append(token_i if head == "0" else int(head) - 1)
                annot["deps"].append("ROOT" if dep == "root" else dep)
                annot["spaces"].append(space_after == "_")
            annot["entities"] = ["-"] * len(annot["words"])
            annot["heads"], annot["deps"] = projectivize(annot["heads"], annot["deps"])
            if oracle_segments:
                docs.append(
                    Doc(nlp.vocab, words=annot["words"], spaces=annot["spaces"]))
                golds.append(GoldParse(docs[-1], **annot))
                assert golds[-1].morphology is not None
            sent_annots.append(annot)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                # Flush the buffered sentences into a single raw-text doc.
                doc, gold = _make_gold(nlp, None, sent_annots)
                assert gold.morphology is not None
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds
        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
            if limit and len(docs) >= limit:
                return docs, golds
    return docs, golds
def test_parser_pseudoprojectivity(en_tokenizer):
    """Round-trip projectivize/deprojectivize and the helper predicates."""

    def deprojectivize(proj_heads, deco_labels):
        # Build a doc with relative heads, then undo the projectivization.
        tokens = en_tokenizer("whatever " * len(proj_heads))
        rel_heads = [head - i for i, head in enumerate(proj_heads)]
        doc = get_doc(tokens.vocab, words=[t.text for t in tokens],
                      deps=deco_labels, heads=rel_heads)
        nonproj.deprojectivize(doc)
        heads = [t.head.i for t in doc]
        deps = [t.dep_ for t in doc]
        return heads, deps

    # fmt: off
    tree = [1, 2, 2]
    nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
    nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl",
              "punct"]
    labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj",
               "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
    # fmt: on
    assert nonproj.decompose("X||Y") == ("X", "Y")
    assert nonproj.decompose("X") == ("X", "")
    assert nonproj.is_decorated("X||Y") is True
    assert nonproj.is_decorated("X") is False
    nonproj._lift(0, tree)
    assert tree == [2, 2, 2]
    assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
    assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
    # fmt: off
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
    assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
    assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                           "nsubj", "acl||dobj", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree
    assert undeco_labels == labels
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
    assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
                           "det", "dobj", "det", "nmod", "aux", "nmod||dobj",
                           "advmod", "det", "amod", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree2
    assert undeco_labels == labels2
    # If the decoration is wrong such that there is no head with the desired
    # label, the structure is kept and the label is undecorated.
    proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
                   "acl||iobj", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == proj_heads
    assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                             "nsubj", "acl", "punct"]
    # If there are two potential new heads, the first one is chosen even if
    # it's wrong.
    proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
                   "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
                   "det", "amod", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    assert undeco_labels == ["advmod", "root", "det", "aux", "advmod", "det",
                             "dobj", "det", "nmod", "aux", "nmod", "advmod",
                             "det", "amod", "punct"]
    # fmt: on
def test_parser_pseudoprojectivity(en_tokenizer):
    """Exercise nonproj's decorate/undecorate helpers and the full round trip."""

    def deprojectivize(proj_heads, deco_labels):
        tokens = en_tokenizer("whatever " * len(proj_heads))
        # Heads are passed to get_doc as offsets relative to each token.
        relative = [h - i for i, h in enumerate(proj_heads)]
        doc = get_doc(
            tokens.vocab,
            words=[t.text for t in tokens],
            deps=deco_labels,
            heads=relative,
        )
        nonproj.deprojectivize(doc)
        return [t.head.i for t in doc], [token.dep_ for token in doc]

    # fmt: off
    tree = [1, 2, 2]
    nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
    nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
    labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det",
               "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
    # fmt: on
    # Label decoration helpers.
    assert nonproj.decompose("X||Y") == ("X", "Y")
    assert nonproj.decompose("X") == ("X", "")
    assert nonproj.is_decorated("X||Y") is True
    assert nonproj.is_decorated("X") is False
    # Lifting and locating non-projective arcs.
    nonproj._lift(0, tree)
    assert tree == [2, 2, 2]
    assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
    assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
    # fmt: off
    # First tree: projectivize, check decoration, then round-trip back.
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
    assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
    assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                           "nsubj", "acl||dobj", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree
    assert undeco_labels == labels
    # Second, larger tree.
    proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
    assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
                           "det", "dobj", "det", "nmod", "aux", "nmod||dobj",
                           "advmod", "det", "amod", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == nonproj_tree2
    assert undeco_labels == labels2
    # If decoration is wrong such that there is no head with the desired
    # label, the structure is kept and the label is undecorated.
    proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
    deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
                   "acl||iobj", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == proj_heads
    assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                             "nsubj", "acl", "punct"]
    # If there are two potential new heads, the first one is chosen even if
    # it's wrong.
    proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
    deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
                   "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
                   "det", "amod", "punct"]
    deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
    assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
    assert undeco_labels == ["advmod", "root", "det", "aux", "advmod", "det",
                             "dobj", "det", "nmod", "aux", "nmod", "advmod",
                             "det", "amod", "punct"]
    # fmt: on
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
              limit=None):
    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True.'''
    if not (raw_text or oracle_segments):
        raise ValueError(
            "At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs, golds = [], []
    for doc_id, (text, conllu_doc) in enumerate(zip(paragraphs, conllu)):
        # Paragraph-level accumulators, filled sentence by sentence.
        doc_words, doc_tags, doc_heads, doc_deps, doc_ents = [], [], [], [], []
        for conllu_sent in conllu_doc:
            sent_words, sent_tags, sent_heads, sent_deps = [], [], [], []
            for id_, word, lemma, pos, tag, morph, head, dep, _1, _2 in conllu_sent:
                # Skip empty nodes ("x.y") and multi-word tokens ("x-y").
                if '.' in id_ or '-' in id_:
                    continue
                idx = int(id_) - 1
                sent_words.append(word)
                sent_tags.append(tag)
                # '0' marks the root; represent it as a self-attachment.
                sent_heads.append(idx if head == '0' else int(head) - 1)
                sent_deps.append('ROOT' if dep == 'root' else dep)
            if oracle_segments:
                # NOTE(review): projectivizing here also changes the heads
                # that get folded into the paragraph-level lists below; this
                # mirrors the original control flow exactly.
                sent_heads, sent_deps = projectivize(sent_heads, sent_deps)
                docs.append(Doc(nlp.vocab, words=sent_words))
                golds.append(
                    GoldParse(docs[-1], words=sent_words, heads=sent_heads,
                              tags=sent_tags, deps=sent_deps,
                              entities=['-'] * len(sent_words)))
            offset = len(doc_words)
            doc_heads.extend(offset + head for head in sent_heads)
            doc_words.extend(sent_words)
            doc_tags.extend(sent_tags)
            doc_deps.extend(sent_deps)
            doc_ents.extend(['-'] * len(sent_words))
        # Build one GoldParse over the whole raw-text paragraph.
        doc_heads, doc_deps = projectivize(doc_heads, doc_deps)
        if raw_text:
            docs.append(nlp.make_doc(text))
            golds.append(
                GoldParse(docs[-1], words=doc_words, tags=doc_tags,
                          heads=doc_heads, deps=doc_deps, entities=doc_ents))
        if limit and doc_id >= limit:
            break
    return docs, golds