def test_spacy_udpipe(lang: str) -> None:
    """End-to-end check of a loaded udpipe pipeline on a two-sentence text.

    Verifies token-level attributes (text, lemma, POS, tag, dep,
    sentence starts, stop words), doc-level flags, sentence splitting,
    and batch processing via ``nlp.pipe``.
    """
    # Annotations and f-string added for consistency with the sibling tests.
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    text = "Testing one, two, three. This is a test."
    doc = nlp(text)
    # Tuples list acceptable alternatives when the model is ambiguous
    # (tags_equal is expected to treat a tuple as "any of these").
    pos_actual = ['VERB', 'NUM', 'PUNCT', 'NUM', 'PUNCT', 'NUM', 'PUNCT',
                  ('PRON', 'DET'), ('AUX', 'VERB'), 'DET', 'NOUN', 'PUNCT']

    # test token attributes
    assert [t.text for t in doc] == ['Testing', 'one', ',', 'two', ',',
                                     'three', '.', 'This', 'is', 'a',
                                     'test', '.']
    assert [t.lemma_ for t in doc] == ['test', 'one', ',', 'two', ',',
                                       'three', '.', 'this', 'be', 'a',
                                       'test', '.']
    assert tags_equal([t.pos_ for t in doc], pos_actual)
    # CoNNL xpostag-s, custom for each UD treebank
    assert [t.tag_ for t in doc] == ['V', 'N', 'FF', 'N', 'FF', 'N', 'FS',
                                     'PD', 'V', 'RI', 'S', 'FS']
    assert [t.dep_ for t in doc] == ['ROOT', 'nummod', 'punct', 'nummod',
                                     'punct', 'nummod', 'punct', 'nsubj',
                                     'cop', 'det', 'ROOT', 'punct']
    assert [t.is_sent_start for t in doc] == [True, None, None, None, None,
                                              None, None, True, None, None,
                                              None, None]
    # Generator instead of a throwaway list inside any() (flake8 C419).
    assert any(t.is_stop for t in doc)

    # test doc attributes
    assert len(list(doc.sents)) == 2
    assert doc.is_tagged
    assert doc.is_parsed
    assert doc.is_sentenced

    # test pipe
    docs = list(nlp.pipe(["Testing one, two, three.", "This is a test."]))
    assert docs[0].text == "Testing one, two, three."
    assert [t.pos_ for t in docs[0]] == [
        'VERB', 'NUM', 'PUNCT', 'NUM', 'PUNCT', 'NUM', 'PUNCT']
    assert docs[1].text == "This is a test."
    assert tags_equal([t.pos_ for t in docs[1]], pos_actual[-5:])
def test_spacy_udpipe_presegmented(lang: str) -> None:
    """Pre-segmented input (a list of sentence strings) must yield the
    same document as the equivalent raw text."""
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    raw = "Testing one, two, three. This is a test."
    segmented = ["Testing one, two, three.", "This is a test."]

    raw_json = nlp(text=raw).to_json()
    seg_json = nlp(text=segmented).to_json()

    # Both serialized docs must agree on text, sentence spans, and tokens.
    for key in ("text", "sents", "tokens"):
        assert raw_json[key] == seg_json[key]
def test_spacy_udpipe_pretokenized(lang: str) -> None:
    """Pre-tokenized input (a list of token lists per sentence) must yield
    the same document as the equivalent raw text."""
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    raw = "Testing one, two, three. This is a test."
    tokenized = [["Testing", "one", ",", "two", ",", "three", "."],
                 ["This", "is", "a", "test", "."]]

    raw_json = nlp(text=raw).to_json()
    tok_json = nlp(text=tokenized).to_json()

    # Both serialized docs must agree on text, sentence spans, and tokens.
    for key in ("text", "sents", "tokens"):
        assert raw_json[key] == tok_json[key]
def test_spacy_udpipe(lang: str) -> None:
    """Contracted articles ("aux") must be expanded into their underlying
    tokens ("à" + "les"), with acceptable POS tags and dependencies."""
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    doc = nlp(text="Attention aux articles contractés!")

    assert [tok.orth_ for tok in doc] == [
        "Attention", "à", "les", "articles", "contractés", "!"
    ]

    # Each position lists the set of POS tags the model may legitimately
    # assign; different model versions differ on ambiguous tokens.
    allowed_pos = [{"INTJ", "NOUN"}, {"ADP"}, {"DET"},
                   {"NOUN"}, {"VERB", "ADJ"}, {"PUNCT"}]
    for tok, candidates in zip(doc, allowed_pos):
        assert tok.pos_ in candidates

    assert [tok.head.i for tok in doc] == [0, 3, 3, 0, 3, 0]

    # Same idea for dependency labels.
    allowed_dep = [{"ROOT", "root"}, {"case"}, {"det"},
                   {"nmod", "obl", "obl:arg"}, {"acl", "amod"}, {"punct"}]
    for tok, candidates in zip(doc, allowed_dep):
        assert tok.dep_ in candidates
def test_spacy_udpipe_default(lang: str) -> None:
    """End-to-end check of the default (English) udpipe pipeline.

    Verifies token-level attributes (text, lemma, POS, tag, dep,
    sentence starts, stop words), doc-level flags, sentence splitting,
    and batch processing via ``nlp.pipe``.
    """
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    text = "Testing one, two, three. This is a test."
    doc = nlp(text=text)
    pos_actual = [
        "PROPN", "NUM", "PUNCT", "NUM", "PUNCT", "NUM",
        "PUNCT", "PRON", "AUX", "DET", "NOUN", "PUNCT"
    ]

    # test token attributes
    assert [t.text for t in doc] == [
        "Testing", "one", ",", "two", ",", "three",  # noqa: E501
        ".", "This", "is", "a", "test", "."
    ]
    assert [t.lemma_ for t in doc] == [
        "test", "one", ",", "two", ",", "three",
        ".", "this", "be", "a", "test", "."
    ]
    assert tags_equal(act=pos_actual, exp=[t.pos_ for t in doc])
    # CoNNL xpostag-s, custom for each UD treebank
    assert [t.tag_ for t in doc] == [
        "NNP", "CD", ",", "CD", ",", "CD",
        ".", "DT", "VBZ", "DT", "NN", "."
    ]
    assert [t.dep_ for t in doc] == [
        "ROOT", "nummod", "punct", "appos", "punct", "nummod",  # noqa: E501
        "punct", "nsubj", "cop", "det", "ROOT", "punct"
    ]
    assert [t.is_sent_start for t in doc] == [
        True, None, None, None, None, None, None,  # noqa: E501
        True, None, None, None, None
    ]
    # Generator instead of a throwaway list inside any() (flake8 C419).
    assert any(t.is_stop for t in doc)

    # test doc attributes
    assert len(list(doc.sents)) == 2
    assert doc.is_tagged
    assert doc.is_parsed
    assert doc.is_sentenced

    # test pipe
    docs = list(nlp.pipe(["Testing one, two, three.", "This is a test."]))
    assert docs[0].text == "Testing one, two, three."
    assert [t.pos_ for t in docs[0]
            ] == ["PROPN", "NUM", "PUNCT", "NUM", "PUNCT", "NUM", "PUNCT"]  # noqa: E501
    assert docs[1].text == "This is a test."
    assert tags_equal(act=pos_actual[-5:], exp=[t.pos_ for t in docs[1]])