def test_issue4665():
    """
    conllu_to_docs should not raise an exception if the HEAD column contains an
    underscore
    """
    # NOTE(review): CoNLL-U rows must be newline-delimited with tab-separated
    # columns; the string is reconstructed here (it had been flattened onto a
    # single line, which the converter cannot split into rows).
    input_data = """
1	[	_	PUNCT	-LRB-	_	_	punct	_	_
2	This	_	DET	DT	_	_	det	_	_
3	killing	_	NOUN	NN	_	_	nsubj	_	_
4	of	_	ADP	IN	_	_	case	_	_
5	a	_	DET	DT	_	_	det	_	_
6	respected	_	ADJ	JJ	_	_	amod	_	_
7	cleric	_	NOUN	NN	_	_	nmod	_	_
8	will	_	AUX	MD	_	_	aux	_	_
9	be	_	AUX	VB	_	_	aux	_	_
10	causing	_	VERB	VBG	_	_	root	_	_
11	us	_	PRON	PRP	_	_	iobj	_	_
12	trouble	_	NOUN	NN	_	_	dobj	_	_
13	for	_	ADP	IN	_	_	case	_	_
14	years	_	NOUN	NNS	_	_	nmod	_	_
15	to	_	PART	TO	_	_	mark	_	_
16	come	_	VERB	VB	_	_	acl	_	_
17	.	_	PUNCT	.	_	_	punct	_	_
18	]	_	PUNCT	-RRB-	_	_	punct	_	_
"""
    # The test passes as long as this call does not raise.
    conllu_to_docs(input_data)
def test_cli_converters_conllu_empty_heads_ner():
    """
    conllu_to_docs should not raise an exception if the HEAD column contains an
    underscore
    """
    # NOTE(review): CoNLL-U rows must be newline-delimited with tab-separated
    # columns; the string is reconstructed here (it had been flattened onto a
    # single line, which the converter cannot split into rows).
    input_data = """
1	[	_	PUNCT	-LRB-	_	_	punct	_	_
2	This	_	DET	DT	_	_	det	_	_
3	killing	_	NOUN	NN	_	_	nsubj	_	_
4	of	_	ADP	IN	_	_	case	_	_
5	a	_	DET	DT	_	_	det	_	_
6	respected	_	ADJ	JJ	_	_	amod	_	_
7	cleric	_	NOUN	NN	_	_	nmod	_	_
8	will	_	AUX	MD	_	_	aux	_	_
9	be	_	AUX	VB	_	_	aux	_	_
10	causing	_	VERB	VBG	_	_	root	_	_
11	us	_	PRON	PRP	_	_	iobj	_	_
12	trouble	_	NOUN	NN	_	_	dobj	_	_
13	for	_	ADP	IN	_	_	case	_	_
14	years	_	NOUN	NNS	_	_	nmod	_	_
15	to	_	PART	TO	_	_	mark	_	_
16	come	_	VERB	VB	_	_	acl	_	_
17	.	_	PUNCT	.	_	_	punct	_	_
18	]	_	PUNCT	-RRB-	_	_	punct	_	_
"""
    docs = list(conllu_to_docs(input_data))
    # With HEAD == "_", every token should end up headed by itself, i.e. all
    # head offsets are 0.
    assert not all([t.head.i for t in docs[0]])
    # NER is unset
    assert not docs[0].has_annotation("ENT_IOB")
def test_cli_converters_conllu_to_docs_name_ner_map(lines):
    """Convert CoNLL-U with an ner_map that renames PER to PERSON and maps
    BAD to the empty string, then check the resulting JSON annotations."""
    input_data = "\n".join(lines)
    ner_map = {"PER": "PERSON", "BAD": ""}
    converted_docs = list(conllu_to_docs(input_data, n_sents=1, ner_map=ner_map))
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    doc_json = converted[0]
    assert doc_json["id"] == 0
    assert len(doc_json["paragraphs"]) == 1
    paragraph = doc_json["paragraphs"][0]
    assert paragraph["raw"] == "Dommer FinnEilertsen avstår. "
    assert len(paragraph["sentences"]) == 1
    sent = paragraph["sentences"][0]
    assert len(sent["tokens"]) == 5
    tokens = sent["tokens"]
    expected_orth = ["Dommer", "Finn", "Eilertsen", "avstår", "."]
    expected_tags = ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
    expected_heads = [1, 2, -1, 0, -1]
    expected_deps = ["appos", "nsubj", "name", "ROOT", "punct"]
    assert [t["orth"] for t in tokens] == expected_orth
    assert [t["tag"] for t in tokens] == expected_tags
    assert [t["head"] for t in tokens] == expected_heads
    assert [t["dep"] for t in tokens] == expected_deps
    ent_offsets = [(e[0], e[1], e[2]) for e in paragraph["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    # PER was remapped to PERSON via ner_map.
    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu_to_docs():
    """Convert a small tab-separated CoNLL-U sample and verify the JSON
    token, head, dependency and BILUO entity annotations."""
    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted_docs = list(conllu_to_docs(input_data, n_sents=1))
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    doc_json = converted[0]
    assert doc_json["id"] == 0
    assert len(doc_json["paragraphs"]) == 1
    paragraph = doc_json["paragraphs"][0]
    assert len(paragraph["sentences"]) == 1
    sent = paragraph["sentences"][0]
    assert len(sent["tokens"]) == 4
    tokens = sent["tokens"]
    expected_orth = ["Dommer", "Finn", "Eilertsen", "avstår"]
    expected_tags = ["NOUN", "PROPN", "PROPN", "VERB"]
    expected_heads = [1, 2, -1, 0]
    expected_deps = ["appos", "nsubj", "name", "ROOT"]
    assert [t["orth"] for t in tokens] == expected_orth
    assert [t["tag"] for t in tokens] == expected_tags
    assert [t["head"] for t in tokens] == expected_heads
    assert [t["dep"] for t in tokens] == expected_deps
    ent_offsets = [(e[0], e[1], e[2]) for e in paragraph["entities"]]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_conllu_to_docs_subtokens():
    """Convert CoNLL-U with a 2-3 multi-word token range, merging subtokens
    and appending morphology, and verify the merged token's tag, pos, morph,
    lemma, head, dep and entity annotations."""
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
        "2-3\tFE\t_\t_\t_\t_\t_\t_\t_\t_",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tname=B-PER",
        "3\tEilertsen\tEilertsen\tX\t_\tGender=Fem|Tense=past\t2\tname\t_\tname=I-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No|name=O",
        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
    ]
    input_data = "\n".join(lines)
    converted_docs = list(
        conllu_to_docs(
            input_data, n_sents=1, merge_subtokens=True, append_morphology=True
        )
    )
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    # "FE" is the surface form of the merged 2-3 range; SpaceAfter=No glues
    # the period onto "avstår".
    assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
    sent = converted[0]["paragraphs"][0]["sentences"][0]
    assert len(sent["tokens"]) == 4
    tokens = sent["tokens"]
    # Removed leftover debug print(tokens) — tests should not emit noise.
    assert [t["orth"] for t in tokens] == ["Dommer", "FE", "avstår", "."]
    # With append_morphology, the tag combines UPOS (joined across the merged
    # subtokens) and the merged FEATS string.
    assert [t["tag"] for t in tokens] == [
        "NOUN__Definite=Ind|Gender=Masc|Number=Sing",
        "PROPN_X__Gender=Fem,Masc|Tense=past",
        "VERB__Mood=Ind|Tense=Pres|VerbForm=Fin",
        "PUNCT",
    ]
    assert [t["pos"] for t in tokens] == ["NOUN", "PROPN", "VERB", "PUNCT"]
    assert [t["morph"] for t in tokens] == [
        "Definite=Ind|Gender=Masc|Number=Sing",
        "Gender=Fem,Masc|Tense=past",
        "Mood=Ind|Tense=Pres|VerbForm=Fin",
        "",
    ]
    # Merged lemma concatenates the subtoken lemmas with a space.
    assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
    assert [t["head"] for t in tokens] == [1, 1, 0, -1]
    assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
    ent_offsets = [
        (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
    ]
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    # B-PER + I-PER collapse to a single-token U-PER after the merge.
    assert biluo_tags == ["O", "U-PER", "O", "O"]