Пример #1
0
def test_issue1915():
    """Expect ``nlp.initialize`` to raise ValueError for a ``hidden_depth`` keyword."""
    pipeline = Language()
    ner_pipe = pipeline.add_pipe("ner")
    ner_pipe.add_label("answer")
    bad_kwargs = {"hidden_depth": 2}  # should error out
    with pytest.raises(ValueError):
        pipeline.initialize(**bad_kwargs)
Пример #2
0
def test_implicit_labels():
    """Spancat starts with no labels and has PERSON/LOC after initializing from examples."""
    pipeline = Language()
    span_categorizer = pipeline.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(span_categorizer.labels) == 0
    examples = make_examples(pipeline)
    pipeline.initialize(get_examples=lambda: examples)
    expected_labels = ("PERSON", "LOC")
    assert span_categorizer.labels == expected_labels
Пример #3
0
def test_implicit_label():
    """Initialize a tagger pipeline from TRAIN_DATA without adding labels by hand."""
    pipeline = Language()
    pipeline.add_pipe("tagger")
    # Each TRAIN_DATA entry is indexed as (text, annotations).
    examples = [
        Example.from_dict(pipeline.make_doc(entry[0]), entry[1])
        for entry in TRAIN_DATA
    ]
    pipeline.initialize(get_examples=lambda: examples)
Пример #4
0
def nlp():
    """Return an initialized pipeline whose ``textcat`` pipe has POSITIVE and NEGATIVE labels."""
    pipeline = Language(Vocab())
    categorizer = pipeline.add_pipe("textcat")
    categorizer.add_label("POSITIVE")
    categorizer.add_label("NEGATIVE")
    pipeline.initialize()
    return pipeline
Пример #5
0
def test_explicit_labels():
    """Labels added by hand before initialization are kept, in insertion order."""
    pipeline = Language()
    span_categorizer = pipeline.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(span_categorizer.labels) == 0
    for label in ("PERSON", "LOC"):
        span_categorizer.add_label(label)
    pipeline.initialize()
    assert span_categorizer.labels == ("PERSON", "LOC")
Пример #6
0
def test_error_with_multi_labels():
    """Initializing ``textcat`` from TRAIN_DATA_MULTI_LABEL is expected to raise ValueError."""
    pipeline = Language()
    pipeline.add_pipe("textcat")
    examples = [
        Example.from_dict(pipeline.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA_MULTI_LABEL
    ]
    with pytest.raises(ValueError):
        pipeline.initialize(get_examples=lambda: examples)
Пример #7
0
def test_ner_labels_added_implicitly_on_greedy_parse():
    """After ``greedy_parse`` on a doc with a "D" entity, "D" is among the pipe's labels."""
    pipeline = Language()
    beam_ner = pipeline.add_pipe("beam_ner")
    for known_label in ("A", "B", "C"):
        beam_ner.add_label(known_label)
    pipeline.initialize()
    # The doc carries an entity type ("D") that was never added explicitly.
    annotated = Doc(pipeline.vocab, words=["hello", "world"], ents=["B-D", "O"])
    beam_ner.greedy_parse([annotated])
    assert "D" in beam_ner.labels
Пример #8
0
def test_implicit_labels():
    """Spancat has no labels up front and has PERSON/LOC after initializing from TRAIN_DATA."""
    pipeline = Language()
    span_categorizer = pipeline.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    assert len(span_categorizer.labels) == 0
    # Each TRAIN_DATA entry is indexed as (text, annotations).
    examples = [
        Example.from_dict(pipeline.make_doc(entry[0]), entry[1])
        for entry in TRAIN_DATA
    ]
    pipeline.initialize(get_examples=lambda: examples)
    assert span_categorizer.labels == ("PERSON", "LOC")
def test_no_resize():
    """Adding a morphologizer label after initialization must raise ValueError."""
    pipeline = Language()
    morph_pipe = pipeline.add_pipe("morphologizer")
    for pos in ("NOUN", "VERB"):
        morph_pipe.add_label("POS" + Morphology.FIELD_SEP + pos)
    pipeline.initialize()
    late_label = "POS" + Morphology.FIELD_SEP + "ADJ"
    # this throws an error because the morphologizer can't be resized after initialization
    with pytest.raises(ValueError):
        morph_pipe.add_label(late_label)
Пример #10
0
def test_tagger_initialize_tag_map():
    """A label added before ``nlp.initialize()`` (called without examples)
    must still be counted afterwards."""
    pipeline = Language()
    tagger_pipe = pipeline.add_pipe("tagger")
    labels_before = len(tagger_pipe.labels)
    tagger_pipe.add_label("A")
    pipeline.initialize()
    labels_after = len(pipeline.get_pipe("tagger").labels)
    assert labels_after == labels_before + 1
Пример #11
0
def tagger():
    """Return a ``tagger`` pipe with one label, taken from an initialized pipeline."""
    pipeline = Language()
    tagger_pipe = pipeline.add_pipe("tagger")
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    tagger_pipe.add_label("A")
    pipeline.initialize()
    return tagger_pipe
Пример #12
0
def test_no_resize(name):
    """The pipe named ``name`` must reject a new label once it has been initialized."""
    pipeline = Language()
    categorizer = pipeline.add_pipe(name)
    for label in ("POSITIVE", "NEGATIVE"):
        categorizer.add_label(label)
    pipeline.initialize()
    output_width = categorizer.model.get_dim("nO")
    assert output_width >= 2
    # this throws an error because the textcat can't be resized after initialization
    with pytest.raises(ValueError):
        categorizer.add_label("NEUTRAL")
Пример #13
0
def test_ner_labels_added_implicitly_on_predict():
    """Running the NER pipe on a doc with a "D" entity keeps the entity and adds the label."""
    pipeline = Language()
    ner_pipe = pipeline.add_pipe("ner")
    for known_label in ("A", "B", "C"):
        ner_pipe.add_label(known_label)
    pipeline.initialize()
    # The doc carries an entity type ("D") that was never added explicitly.
    annotated = Doc(pipeline.vocab, words=["hello", "world"], ents=["B-D", "O"])
    ner_pipe(annotated)
    entity_types = [token.ent_type_ for token in annotated]
    assert entity_types == ["D", ""]
    assert "D" in ner_pipe.labels
Пример #14
0
def test_ner_labels_added_implicitly_on_update():
    """"D" is absent from the NER labels before ``nlp.update`` and present afterwards."""
    pipeline = Language()
    ner_pipe = pipeline.add_pipe("ner")
    for known_label in ("A", "B", "C"):
        ner_pipe.add_label(known_label)
    pipeline.initialize()
    reference = Doc(pipeline.vocab, words=["hello", "world"], ents=["B-D", "O"])
    predicted = pipeline.make_doc(reference.text)
    training_example = Example(predicted, reference)
    assert "D" not in ner_pipe.labels
    pipeline.update([training_example])
    assert "D" in ner_pipe.labels
Пример #15
0
def test_no_resize():
    """Spancat reports two output classes and rejects a third label after initialization."""
    pipeline = Language()
    span_categorizer = pipeline.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    for label in ("Thing", "Phrase"):
        span_categorizer.add_label(label)
    assert span_categorizer.labels == ("Thing", "Phrase")
    pipeline.initialize()
    output_width = span_categorizer.model.get_dim("nO")
    assert output_width == 2
    # this throws an error because the spancat can't be resized after initialization
    with pytest.raises(ValueError):
        span_categorizer.add_label("Stuff")
Пример #16
0
def test_issue2564():
    """Docs produced by a direct call and by Language.pipe both report TAG annotation."""
    pipeline = Language()
    tagger_pipe = pipeline.add_pipe("tagger")
    tagger_pipe.add_label("A")
    pipeline.initialize()
    direct_doc = pipeline("hello world")
    assert direct_doc.has_annotation("TAG")
    # Only the first doc yielded by pipe() is inspected.
    first_piped = next(pipeline.pipe(["hello", "world"]))
    assert first_piped.has_annotation("TAG")
Пример #17
0
def test_no_resize():
    """The tagger reports two output classes and rejects a third label after initialization."""
    pipeline = Language()
    tagger_pipe = pipeline.add_pipe("tagger")
    for tag in ("N", "V"):
        tagger_pipe.add_label(tag)
    assert tagger_pipe.labels == ("N", "V")
    pipeline.initialize()
    output_width = tagger_pipe.model.get_dim("nO")
    assert output_width == 2
    # this throws an error because the tagger can't be resized after initialization
    with pytest.raises(ValueError):
        tagger_pipe.add_label("J")
Пример #18
0
def test_no_resize(name, textcat_config):
    """The old textcat architectures weren't resizable: a late label must raise."""
    pipeline = Language()
    categorizer = pipeline.add_pipe(name, config={"model": textcat_config})
    for label in ("POSITIVE", "NEGATIVE"):
        categorizer.add_label(label)
    pipeline.initialize()
    # The output dimension is either unset or equal to the number of labels.
    output_width = categorizer.model.maybe_get_dim("nO")
    assert output_width in (2, None)
    # this throws an error because the textcat can't be resized after initialization
    with pytest.raises(ValueError):
        categorizer.add_label("NEUTRAL")
Пример #19
0
def test_label_types(name):
    """A non-string label raises ValueError; initialization depends on the pipe name."""
    pipeline = Language()
    categorizer = pipeline.add_pipe(name)
    categorizer.add_label("answer")
    with pytest.raises(ValueError):
        categorizer.add_label(9)
    if name != "textcat":
        pipeline.initialize()
        return
    # textcat requires at least two labels
    with pytest.raises(ValueError):
        pipeline.initialize()
Пример #20
0
def test_issue9904():
    """The textcat loss must be (approximately) unchanged when the batch is duplicated."""
    pipeline = Language()
    categorizer = pipeline.add_pipe("textcat")
    get_examples = make_get_examples_single_label(pipeline)
    pipeline.initialize(get_examples)

    batch = get_examples()
    predictions = categorizer.predict([example.predicted for example in batch])

    # get_loss returns a sequence; only its first element is compared.
    single_loss = categorizer.get_loss(batch, predictions)[0]
    doubled_loss = categorizer.get_loss(batch * 2, predictions.repeat(2, axis=0))[0]
    assert single_loss == pytest.approx(doubled_loss)
Пример #21
0
def test_resize(name, textcat_config):
    """The new textcat architectures are resizable: a late label is accepted."""
    pipeline = Language()
    categorizer = pipeline.add_pipe(name, config={"model": textcat_config})
    for label in ("POSITIVE", "NEGATIVE"):
        categorizer.add_label(label)
    # Before and after initialization the output dimension is unset or 2.
    assert categorizer.model.maybe_get_dim("nO") in (2, None)
    pipeline.initialize()
    assert categorizer.model.maybe_get_dim("nO") in (2, None)
    # A third label is accepted, and the dimension is then unset or 3.
    categorizer.add_label("NEUTRAL")
    assert categorizer.model.maybe_get_dim("nO") in (3, None)
Пример #22
0
def test_doc_gc():
    """Indexing a span group is expected to raise RuntimeError once only ``doc.spans`` is kept."""
    # If the Doc object is garbage collected, the spans won't be functional afterwards
    pipeline = Language()
    span_categorizer = pipeline.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    span_categorizer.add_label("PERSON")
    pipeline.initialize()
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]
    # Keep only doc.spans; no reference to the Doc objects themselves is retained.
    collected = [doc.spans for doc in pipeline.pipe(texts)]
    for _text, groups in zip(texts, collected):
        assert isinstance(groups, SpanGroups)
        for _key, group in groups.items():
            assert isinstance(group, SpanGroup)
            assert len(group) > 0
            with pytest.raises(RuntimeError):
                _ = group[0]
Пример #23
0
def entity_linker():
    """Return an ``entity_linker`` pipe with a one-entity knowledge base, after initialization."""
    pipeline = Language()

    def create_kb(vocab):
        # Knowledge base with vectors of length 1 and a single entity, "test".
        knowledge_base = KnowledgeBase(vocab, entity_vector_length=1)
        entity_vector = zeros((1, 1), dtype="f")
        knowledge_base.add_entity("test", 0.0, entity_vector)
        return knowledge_base

    linker = pipeline.add_pipe("entity_linker")
    linker.set_kb(create_kb)
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    pipeline.initialize()
    return linker
Пример #24
0
def test_initialize_examples():
    """``nlp.initialize`` accepts a callable returning examples and raises TypeError otherwise."""
    pipeline = Language()
    pipeline.add_pipe("trainable_lemmatizer")
    # Each TRAIN_DATA entry is indexed as (text, annotations).
    examples = [
        Example.from_dict(pipeline.make_doc(entry[0]), entry[1])
        for entry in TRAIN_DATA
    ]
    # you shouldn't really call this more than once, but for testing it should be fine
    pipeline.initialize(get_examples=lambda: examples)
    invalid_inputs = (
        lambda: None,  # callable returning None
        lambda: examples[0],  # callable returning a single example
        lambda: [],  # callable returning an empty list
        examples,  # the list itself rather than a callable
    )
    for bad_get_examples in invalid_inputs:
        with pytest.raises(TypeError):
            pipeline.initialize(get_examples=bad_get_examples)
Пример #25
0
def test_initialize_from_labels():
    """A second lemmatizer initialized from the first one's ``label_data`` gets the expected ``tree2label``."""
    source_nlp = Language()
    source_lemmatizer = source_nlp.add_pipe("trainable_lemmatizer")
    source_lemmatizer.min_tree_freq = 1
    # Each TRAIN_DATA entry is indexed as (text, annotations).
    examples = [
        Example.from_dict(source_nlp.make_doc(entry[0]), entry[1])
        for entry in TRAIN_DATA
    ]
    source_nlp.initialize(get_examples=lambda: examples)

    target_nlp = Language()
    target_lemmatizer = target_nlp.add_pipe("trainable_lemmatizer")
    # Initialize the pipe directly, handing over the labels of the first lemmatizer.
    target_lemmatizer.initialize(
        get_examples=lambda: examples,
        labels=source_lemmatizer.label_data,
    )
    expected_mapping = {1: 0, 3: 1, 4: 2, 6: 3}
    assert target_lemmatizer.tree2label == expected_mapping
Пример #26
0
def test_simple_train():
    """Train a one-label text categorizer on toy strings and check its prediction.

    Texts made of "a" are annotated with a truthy "answer" score and texts
    made of "b" with a zero score. After five passes over the data, "aaa"
    must carry an "answer" category with a score of at least 0.5.
    """
    nlp = Language()
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("answer")
    # NOTE(review): test_label_types in this file expects nlp.initialize() to
    # raise ValueError for "textcat" with a single label -- confirm this setup
    # still initializes on the targeted spaCy version.
    nlp.initialize()
    train_data = [
        ("aaaa", 1.0),
        ("bbbb", 0),
        ("aa", 1.0),
        ("bbbbbbbbb", 0.0),
        ("aaaaaa", 1),
    ]
    for _ in range(5):
        for text, answer in train_data:
            # Language.update takes a batch of Example objects; a raw
            # (text, annotations) tuple is rejected with a TypeError.
            example = Example.from_dict(
                nlp.make_doc(text), {"cats": {"answer": answer}}
            )
            nlp.update([example])
    doc = nlp("aaa")
    assert "answer" in doc.cats
    assert doc.cats["answer"] >= 0.5
Пример #27
0
def test_language_source_and_vectors(nlp2):
    """Sourcing a pipe into ``nlp2`` warns, copies strings, and leaves the source vectors unchanged."""
    source_nlp = Language(Vocab())
    categorizer = source_nlp.add_pipe("textcat")
    categorizer.add_label("POSITIVE")
    categorizer.add_label("NEGATIVE")
    source_nlp.initialize()
    long_string = "thisisalongstring"
    # The string is unknown to both vocabularies until it is added to the source.
    assert long_string not in source_nlp.vocab.strings
    assert long_string not in nlp2.vocab.strings
    source_nlp.vocab.strings.add(long_string)
    assert source_nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes()
    vectors_before = source_nlp.vocab.vectors.to_bytes()
    with pytest.warns(UserWarning):
        nlp2.add_pipe("textcat", name="textcat2", source=source_nlp)
    # strings should be added
    assert long_string in nlp2.vocab.strings
    # vectors should remain unmodified
    assert source_nlp.vocab.vectors.to_bytes() == vectors_before
Пример #28
0
def test_simple_train():
    """Train spancat for 40 updates with a fixed seed and check spans and F-score."""
    fix_random_seed(0)
    pipeline = Language()
    span_categorizer = pipeline.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    get_examples = make_get_examples(pipeline)
    pipeline.initialize(get_examples)
    optimizer = pipeline.create_optimizer()
    assert len(span_categorizer.labels) != 0
    for _ in range(40):
        # A fresh losses dict is passed on every step, as nothing reads it afterwards.
        step_losses = {}
        pipeline.update(list(get_examples()), losses=step_losses, drop=0.1, sgd=optimizer)
    doc = pipeline("I like London and Berlin.")
    assert doc.spans[span_categorizer.key] == doc.spans[SPAN_KEY]
    predicted_spans = doc.spans[span_categorizer.key]
    assert len(predicted_spans) == 2
    assert predicted_spans[0].text == "London"
    scores = pipeline.evaluate(get_examples())
    f_score_key = f"spans_{SPAN_KEY}_f"
    assert f_score_key in scores
    assert scores[f_score_key] == 1.0
Пример #29
0
def test_set_candidates():
    """Check the "candidates" span group that ``set_candidates`` writes on the docs.

    For the first text the group must be a SpanGroup with 9 spans, whose
    first span is "Just" and whose fifth span is "Just a".
    """
    nlp = Language()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    nlp.initialize(get_examples=lambda: train_examples)
    texts = [
        "Just a sentence.",
        "I like London and Berlin",
        "I like Berlin",
        "I eat ham.",
    ]

    docs = [nlp(text) for text in texts]
    spancat.set_candidates(docs)

    assert len(docs) == len(texts)
    candidates = docs[0].spans["candidates"]
    # isinstance is the idiomatic type check and matches test_doc_gc.
    assert isinstance(candidates, SpanGroup)
    assert len(candidates) == 9
    assert candidates[0].text == "Just"
    assert candidates[4].text == "Just a"
Пример #30
0
def test_initialize_examples(name, get_examples, train_data):
    """Initialize the pipe ``name`` with and without examples and check rejected inputs.

    Every label found under the "cats" key of ``train_data`` is added first.
    ``get_examples`` is called with the pipeline to obtain the value handed to
    ``nlp.initialize``.
    """
    nlp = Language()
    textcat = nlp.add_pipe(name)
    for text, annotations in train_data:
        # Only the label names are used; text and value are ignored.
        for label, value in annotations.get("cats").items():
            textcat.add_label(label)
    # you shouldn't really call this more than once, but for testing it should be fine
    nlp.initialize()
    nlp.initialize(get_examples=get_examples(nlp))
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        # NOTE(review): get_examples is called here without ``nlp``, unlike above.
        # The TypeError may come from this call itself rather than from
        # nlp.initialize -- confirm which failure is intended.
        nlp.initialize(get_examples=get_examples())