Example #1
def test_transformer_pipeline_todisk_settings():
    nlp = English()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    assert trf.model.tokenizer.model_max_length == 512
    # initially no attentions
    assert trf.model.transformer.config.output_attentions is False
    assert "attentions" not in nlp("test")._.trf_data.model_output
    # modify model_max_length (note that modifications to
    # tokenizer.model_max_length are not serialized by save_pretrained
    # see: https://github.com/explosion/spaCy/discussions/7393)
    trf.model.tokenizer.init_kwargs["model_max_length"] = 499
    # add attentions on-the-fly
    trf.model.transformer.config.output_attentions = True
    assert nlp("test")._.trf_data.model_output.attentions is not None
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["transformer"]
        trf2 = nlp2.get_pipe("transformer")
        # model_max_length is preserved
        assert trf2.model.tokenizer.model_max_length == 499
        # output_attentions setting is preserved
        assert trf2.model.transformer.config.output_attentions is True
        assert nlp2("test")._.trf_data.model_output.attentions is not None
        # the init configs are empty SimpleFrozenDicts
        assert trf2.model._init_tokenizer_config == {}
        with pytest.raises(NotImplementedError):
            trf2.model._init_tokenizer_config["use_fast"] = False
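Note: the snippets in this section are excerpted from the spaCy and spacy-transformers test suites, so they rely on module-level imports and fixtures (DEFAULT_CONFIG, cfg_string, inline_cfg_string, TRAIN_DATA, TRAIN_DATA_OVERLAPPING, SPAN_KEY, make_examples, _assert_equal_tensors) defined in the original test files. The sketch below lists the kind of imports they assume; the spacy-transformers classes and the fixture values are only indicated in comments, since their exact import paths and contents depend on the package versions used.

import pickle

import pytest
import spacy
from numpy.testing import assert_equal
from spacy import util
from spacy.lang.en import English
from spacy.language import Language
from spacy.strings import StringStore
from spacy.training import Example
from spacy.util import make_tempdir
from thinc.api import Config, Model, fix_random_seed

# Assumed to be provided by spacy-transformers and the original test modules
# (import paths and fixture contents omitted as version-dependent):
# Transformer, TransformerModel, TransformerListener, EditTrees,
# DEFAULT_CONFIG, cfg_string, inline_cfg_string, TRAIN_DATA,
# TRAIN_DATA_OVERLAPPING, SPAN_KEY, make_examples, _assert_equal_tensors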
Example #2
def test_transformer_pipeline_textcat():
    """Test that a pipeline with just a transformer+textcat runs and trains properly.
    This used to throw an error because of shape inference issues -
    cf https://github.com/explosion/spaCy/issues/6401"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["transformer", "textcat"]
    train_examples = []

    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("We're interested at underwater basket weaving.")
    cats1 = doc.cats

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = spacy.load(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        cats2 = doc2.cats
        assert cats1 == cats2
Example #3
def test_transformer_pipeline_todisk():
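    # Round-trip a freshly initialized transformer pipeline through
    # nlp.to_disk() and spacy.load().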
    nlp = English()
    nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["transformer"]
Example #4
def test_initialized_transformer_todisk():
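    # Serialize just the initialized transformer component and load it back
    # into a separately constructed pipe via from_disk().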
    nlp = Language()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    with make_tempdir() as d:
        trf.to_disk(d)
        nlp2 = Language()
        trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
        trf2.from_disk(d)
Example #5
def test_transformer_pipeline_todisk_before_initialize():
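    # Serializing before initialization should still yield a loadable pipeline
    # that produces transformer output once the loaded copy is initialized.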
    nlp = English()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    with make_tempdir() as d:
        # serialize before initialization
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        nlp2.initialize()
        assert "last_hidden_state" in nlp2("test")._.trf_data.model_output
Example #6
def test_transformer_pipeline_tagger_senter_listener():
    """Test that a pipeline with just a transformer+tagger+senter runs and
    trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "tagger", "senter"]
    tagger = nlp.get_pipe("tagger")
    transformer = nlp.get_pipe("transformer")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(tagger_trf, TransformerListener)
    assert tagger_trf.upstream_name == "custom_upstream"
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # Check that the Transformer component finds its listeners
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert tagger_trf in transformer.listeners

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])
    _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to a directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
Example #7
def test_inline_transformer_pipeline_todisk():
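    # A tagger with an inline transformer (no separate "transformer" pipe)
    # should survive a to_disk / spacy.load round trip.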
    orig_config = Config().from_str(inline_cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["tagger"]
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["tagger"]
Example #8
def test_overfitting_IO():
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["trainable_lemmatizer"] < 0.00001

    test_text = "She likes blue eggs"
    doc = nlp(test_text)
    assert doc[0].lemma_ == "she"
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"
    assert doc[3].lemma_ == "egg"

    # Check model after a {to,from}_disk roundtrip
    with util.make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].lemma_ == "she"
        assert doc2[1].lemma_ == "like"
        assert doc2[2].lemma_ == "blue"
        assert doc2[3].lemma_ == "egg"

    # Check model after a {to,from}_bytes roundtrip
    nlp_bytes = nlp.to_bytes()
    nlp3 = English()
    nlp3.add_pipe("trainable_lemmatizer")
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(test_text)
    assert doc3[0].lemma_ == "she"
    assert doc3[1].lemma_ == "like"
    assert doc3[2].lemma_ == "blue"
    assert doc3[3].lemma_ == "egg"

    # Check model after a pickle roundtrip.
    nlp_bytes = pickle.dumps(nlp)
    nlp4 = pickle.loads(nlp_bytes)
    doc4 = nlp4(test_text)
    assert doc4[0].lemma_ == "she"
    assert doc4[1].lemma_ == "like"
    assert doc4[2].lemma_ == "blue"
    assert doc4[3].lemma_ == "egg"
Example #9
def test_transformer_pipeline_tagger_internal():
    """Test that a tagger with internal transformer runs and trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["tagger"]
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(tagger_trf, Model)
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("We're interested at underwater basket weaving.")
    doc_tensor = tagger_trf.predict([doc])

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_config(orig_config,
                                           auto_fill=True,
                                           validate=True)
        nlp2.initialize(lambda: train_examples)

        # results are not the same if we don't call from_disk
        doc2 = nlp2("We're interested at underwater basket weaving.")
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        with pytest.raises(AssertionError):
            assert_equal(doc_tensor2.doc_data[0].tensors,
                         doc_tensor.doc_data[0].tensors)

        # results ARE the same if we call from_disk
        nlp2.from_disk(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        assert_equal(doc_tensor2.doc_data[0].tensors,
                     doc_tensor.doc_data[0].tensors)
Example #10
def test_initialized_inline_transformer_pipeline_todisk():
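    # An initialized inline-transformer pipeline: the tagger label added
    # before initialization should be preserved across the round trip.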
    orig_config = Config().from_str(inline_cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["tagger"]
    tagger = nlp.get_pipe("tagger")
    tagger.add_label("V")
    nlp.initialize()
    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp2 = spacy.load(d)
        assert nlp2.pipe_names == ["tagger"]
        tagger2 = nlp2.get_pipe("tagger")
        assert list(tagger2.labels) == ["V"]
Example #11
def test_overfitting_IO():
    # Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 2
    assert len(spans.attrs["scores"]) == 2
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {"London", "Berlin"}
    assert set([span.label_ for span in spans]) == {"LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 2
        assert len(spans2.attrs["scores"]) == 2
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {"London", "Berlin"}
        assert set([span.label_ for span in spans2]) == {"LOC"}

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_p"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_r"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0

    # also test that the spancat works for just a single entity in a sentence
    doc = nlp("London")
    assert len(doc.spans[spancat.key]) == 1
Example #12
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains + IO goes OK"""
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to a directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
Example #13
def test_overfitting_IO_overlapping():
    # Test for overfitting on overlapping entities
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})

    train_examples = make_examples(nlp, data=TRAIN_DATA_OVERLAPPING)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 3
    assert set(spancat.labels) == {"PERSON", "LOC", "DOUBLE_LOC"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 3
    assert len(spans.attrs["scores"]) == 3
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {
        "London",
        "Berlin",
        "London and Berlin",
    }
    assert set([span.label_ for span in spans]) == {"LOC", "DOUBLE_LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 3
        assert len(spans2.attrs["scores"]) == 3
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {
            "London",
            "Berlin",
            "London and Berlin",
        }
        assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
Example #14
def test_from_to_disk():
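    # Edit trees (as used by the trainable lemmatizer) should round-trip
    # through to_disk / from_disk without changing their nodes.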
    strings = StringStore()
    trees = EditTrees(strings)
    trees.add("deelt", "delen")
    trees.add("gedeeld", "delen")

    trees2 = EditTrees(strings)
    with make_tempdir() as temp_dir:
        trees_file = temp_dir / "edit_trees.bin"
        trees.to_disk(trees_file)
        trees2 = trees2.from_disk(trees_file)

    # Verify that the nodes did not change.
    assert len(trees) == len(trees2)
    for i in range(len(trees)):
        assert trees.tree_to_str(i) == trees2.tree_to_str(i)

    # Reinserting the same trees should not add new nodes.
    trees2.add("deelt", "delen")
    trees2.add("gedeeld", "delen")
    assert len(trees) == len(trees2)
Example #15
def test_load_disable_enable() -> None:
    """
    Tests spacy.load() with dis-/enabling components.
    """

    base_nlp = English()
    for pipe in ("sentencizer", "tagger", "parser"):
        base_nlp.add_pipe(pipe)

    with make_tempdir() as tmp_dir:
        base_nlp.to_disk(tmp_dir)
        to_disable = ["parser", "tagger"]
        to_enable = ["tagger", "parser"]

        # Setting only `disable`.
        nlp = spacy.load(tmp_dir, disable=to_disable)
        assert all([comp_name in nlp.disabled for comp_name in to_disable])

        # Setting only `enable`.
        nlp = spacy.load(tmp_dir, enable=to_enable)
        assert all([(comp_name in nlp.disabled) is (comp_name not in to_enable)
                    for comp_name in nlp.component_names])

        # Testing consistent enable/disable combination.
        nlp = spacy.load(
            tmp_dir,
            enable=to_enable,
            disable=[
                comp_name for comp_name in nlp.component_names
                if comp_name not in to_enable
            ],
        )
        assert all([(comp_name in nlp.disabled) is (comp_name not in to_enable)
                    for comp_name in nlp.component_names])

        # Inconsistent enable/disable combination.
        with pytest.raises(ValueError):
            spacy.load(tmp_dir, enable=to_enable, disable=["parser"])
Example #16
def test_replace_listeners():
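    # Replacing the tagger's transformer listener with an inline copy via
    # nlp.replace_listeners() should keep predictions identical and keep
    # serialization working.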
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    text = "This is awesome"
    examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
    optimizer = nlp.initialize(lambda: examples)
    # verify correct configuration with transformer listener
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    tagger_listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(tagger_listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == tagger_listener
    assert isinstance(transformer.model, TransformerModel)
    assert (
        nlp.config["components"]["transformer"]["model"]["@architectures"]
        == "spacy-transformers.TransformerModel.v3"
    )
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.TransformerListener.v1"
    )
    # train pipe before replacing listeners
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        doc = nlp(text)

    preds = [t.tag_ for t in doc]
    doc_tensor = tagger_tok2vec.predict([doc])

    # replace listener and verify predictions are still the same
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert isinstance(tagger_tok2vec, Model)
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.Tok2VecTransformer.v3"
    )
    doc2 = nlp(text)
    assert preds == [t.tag_ for t in doc2]
    pred_tensor = tagger_tok2vec.predict([doc2])
    _assert_equal_tensors(doc_tensor, pred_tensor)

    # attempt training with the new pipeline
    optimizer = nlp.resume_training()
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tagger"] > 0.0

    # check for presence of additional fields in model_output
    assert doc2._.trf_data.model_output.pooler_output is not None
    assert doc2._.trf_data.model_output.attentions is not None

    # ensure IO goes OK
    doc_tensor_trained = tagger_tok2vec.predict([doc])
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc3 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_tok2vec2 = tagger2.model.get_ref("tok2vec")
        pred_tensor = tagger_tok2vec2.predict([doc3])
        _assert_equal_tensors(doc_tensor_trained, pred_tensor)