# 示例#1 (Example #1)
# 0
def test_spacy_udpipe_it(lang: str) -> None:
    """Check token-, doc- and pipe-level attributes of the Italian model.

    Renamed from ``test_spacy_udpipe``: the original name collided with a
    later function of the same name, so Python kept only the last
    definition and pytest silently never collected this test.
    """
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    text = "Testing one, two, three. This is a test."
    doc = nlp(text=text)

    # A tuple entry means either tag is accepted (model-version variance).
    pos_actual = ['VERB', 'NUM', 'PUNCT', 'NUM', 'PUNCT', 'NUM', 'PUNCT',
                  ('PRON', 'DET'), ('AUX', 'VERB'), 'DET', 'NOUN', 'PUNCT']
    # test token attributes
    assert [t.text for t in doc] == ['Testing', 'one', ',', 'two', ',', 'three', '.',
                                     'This', 'is', 'a', 'test', '.']
    assert [t.lemma_ for t in doc] == ['test', 'one', ',', 'two', ',', 'three', '.',
                                       'this', 'be', 'a', 'test', '.']
    assert tags_equal([t.pos_ for t in doc], pos_actual)
    assert [t.tag_ for t in doc] == ['V', 'N', 'FF', 'N', 'FF', 'N', 'FS',
                                     'PD', 'V', 'RI', 'S', 'FS']  # CoNNL xpostag-s, custom for each UD treebank
    assert [t.dep_ for t in doc] == ['ROOT', 'nummod', 'punct', 'nummod', 'punct', 'nummod', 'punct',
                                     'nsubj', 'cop', 'det', 'ROOT', 'punct']
    assert [t.is_sent_start for t in doc] == [True, None, None, None, None, None, None,
                                              True, None, None, None, None]
    assert any(t.is_stop for t in doc)
    # test doc attributes
    assert len(list(doc.sents)) == 2
    assert doc.is_tagged
    assert doc.is_parsed
    assert doc.is_sentenced
    # test pipe
    docs = list(nlp.pipe(["Testing one, two, three.", "This is a test."]))
    assert docs[0].text == "Testing one, two, three."
    assert [t.pos_ for t in docs[0]] == [
        'VERB', 'NUM', 'PUNCT', 'NUM', 'PUNCT', 'NUM', 'PUNCT']
    assert docs[1].text == "This is a test."
    assert tags_equal([t.pos_ for t in docs[1]], pos_actual[-5:])

def test_spacy_udpipe_presegmented(lang: str) -> None:
    """A list of sentence strings must parse identically to the raw text."""
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    raw = "Testing one, two, three. This is a test."
    segmented = ["Testing one, two, three.", "This is a test."]

    plain_json = nlp(text=raw).to_json()
    segmented_json = nlp(text=segmented).to_json()

    for key in ("text", "sents", "tokens"):
        assert plain_json[key] == segmented_json[key]

def test_spacy_udpipe_pretokenized(lang: str) -> None:
    """Pre-tokenized input (lists of token strings) must match raw-text parsing."""
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    raw = "Testing one, two, three. This is a test."
    tokenized = [["Testing", "one", ",", "two", ",", "three", "."],
                 ["This", "is", "a", "test", "."]]

    plain_json = nlp(text=raw).to_json()
    tokenized_json = nlp(text=tokenized).to_json()

    for key in ("text", "sents", "tokens"):
        assert plain_json[key] == tokenized_json[key]

def test_spacy_udpipe_fr(lang: str) -> None:
    """Check that French contracted articles ("aux" -> "à" + "les") are split.

    Renamed from ``test_spacy_udpipe``: the original name collided with an
    earlier function of the same name, so only one of the two was ever
    collected by pytest.
    """
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    text = "Attention aux articles contractés!"
    doc = nlp(text=text)

    assert [t.orth_ for t in doc
            ] == ["Attention", "à", "les", "articles", "contractés", "!"]

    # Sets allow for tagging differences across model versions.
    pos = [{"INTJ", "NOUN"}, {"ADP"}, {"DET"}, {"NOUN"}, {"VERB", "ADJ"},
           {"PUNCT"}]
    for i, t in enumerate(doc):
        assert t.pos_ in pos[i]

    assert [t.head.i for t in doc] == [0, 3, 3, 0, 3, 0]

    dep = [{"ROOT", "root"}, {"case"}, {"det"}, {"nmod", "obl", "obl:arg"},
           {"acl", "amod"}, {"punct"}]
    for i, t in enumerate(doc):
        assert t.dep_ in dep[i]

def test_spacy_udpipe_default(lang: str) -> None:
    """End-to-end smoke test of the default udpipe pipeline on English text."""
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    doc = nlp(text="Testing one, two, three. This is a test.")

    # Expected per-token values, one entry per token of the two sentences.
    expected_pos = [
        "PROPN", "NUM", "PUNCT", "NUM", "PUNCT", "NUM", "PUNCT",
        "PRON", "AUX", "DET", "NOUN", "PUNCT",
    ]
    expected_text = [
        "Testing", "one", ",", "two", ",", "three", ".",
        "This", "is", "a", "test", ".",
    ]
    expected_lemma = [
        "test", "one", ",", "two", ",", "three", ".",
        "this", "be", "a", "test", ".",
    ]
    # CoNNL xpostag-s are treebank-specific (Penn-style tags for English).
    expected_tag = [
        "NNP", "CD", ",", "CD", ",", "CD", ".",
        "DT", "VBZ", "DT", "NN", ".",
    ]
    expected_dep = [
        "ROOT", "nummod", "punct", "appos", "punct", "nummod", "punct",
        "nsubj", "cop", "det", "ROOT", "punct",
    ]
    expected_sent_start = [
        True, None, None, None, None, None, None,
        True, None, None, None, None,
    ]

    # token-level attributes
    assert [t.text for t in doc] == expected_text
    assert [t.lemma_ for t in doc] == expected_lemma
    assert tags_equal(act=expected_pos, exp=[t.pos_ for t in doc])
    assert [t.tag_ for t in doc] == expected_tag
    assert [t.dep_ for t in doc] == expected_dep
    assert [t.is_sent_start for t in doc] == expected_sent_start
    assert any(t.is_stop for t in doc)

    # doc-level attributes
    assert len(list(doc.sents)) == 2
    assert doc.is_tagged
    assert doc.is_parsed
    assert doc.is_sentenced

    # nlp.pipe over pre-split sentences
    docs = list(nlp.pipe(["Testing one, two, three.", "This is a test."]))
    assert docs[0].text == "Testing one, two, three."
    assert [t.pos_ for t in docs[0]] == expected_pos[:7]
    assert docs[1].text == "This is a test."
    assert tags_equal(act=expected_pos[-5:], exp=[t.pos_ for t in docs[1]])