Example #1
def test_Example_aligned_whitespace(en_vocab):
    words = ["a", " ", "b"]
    tags = ["A", "SPACE", "B"]
    predicted = Doc(en_vocab, words=words)
    reference = Doc(en_vocab, words=words, tags=tags)

    example = Example(predicted, reference)
    assert example.get_aligned("TAG", as_string=True) == tags
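These snippets assume spaCy v3 test-style imports and fixtures, which the source page omits; a minimal header for running them locally might look like this (the en_vocab fixture is sketched as an assumption, mirroring spaCy's own test setup):

import pytest
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.training import Example


@pytest.fixture
def en_vocab():
    # assumed fixture: a bare English vocab
    return spacy.blank("en").vocab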
Example #2
def test_Example_init_requires_doc_objects():
    vocab = Vocab()
    with pytest.raises(TypeError):
        Example(None, None)
    with pytest.raises(TypeError):
        Example(Doc(vocab, words=["hi"]), None)
    with pytest.raises(TypeError):
        Example(None, Doc(vocab, words=["hi"]))
Example #3
def test_aligned_tags():
    pred_words = ["Apply", "some", "sunscreen", "unless", "you", "can", "not"]
    gold_words = ["Apply", "some", "sun", "screen", "unless", "you", "cannot"]
    gold_tags = ["VERB", "DET", "NOUN", "NOUN", "SCONJ", "PRON", "VERB"]
    annots = {"words": gold_words, "tags": gold_tags}
    vocab = Vocab()
    predicted = Doc(vocab, words=pred_words)
    example1 = Example.from_dict(predicted, annots)
    aligned_tags1 = example1.get_aligned("TAG", as_string=True)
    assert aligned_tags1 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
    # ensure that to_dict works correctly
    example2 = Example.from_dict(predicted, example1.to_dict())
    aligned_tags2 = example2.get_aligned("TAG", as_string=True)
    assert aligned_tags2 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
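For illustration, the token-level alignment behind get_aligned can also be inspected directly; this is a sketch assuming spaCy v3's Example.alignment API, whose x2y/y2x members expose flat index data plus per-token lengths:

align = example1.alignment
# x2y maps predicted tokens to reference tokens, e.g. predicted "sunscreen"
# aligns to the two reference tokens "sun" and "screen"
print(align.x2y.lengths, align.x2y.data)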
Example #4
def test_Example_from_dict_with_sent_start(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.sents)) == 2
    for i, token in enumerate(example.reference):
        assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
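This test relies on an annots fixture defined elsewhere in the module; a plausible shape, purely for illustration, is:

@pytest.fixture
def annots():
    # hypothetical fixture: words plus per-token sentence-start flags
    return {
        "words": ["This", "is", "one", "sentence", "this", "is", "another"],
        "sent_starts": [1, 0, 0, 0, 1, 0, 0],
    }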
Example #5
def test_transformer_pipeline_textcat():
    """Test that a pipeline with just a transformer+textcat runs and trains properly.
    This used to throw an error because of shape inference issues -
    cf https://github.com/explosion/spaCy/issues/6401"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["transformer", "textcat"]
    train_examples = []

    for text, annotations in TRAIN_DATA:
        train_examples.append(
            Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("We're interested at underwater basket weaving.")
    cats1 = doc.cats

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = spacy.load(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        cats2 = doc2.cats
        assert cats1 == cats2
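cfg_string and TRAIN_DATA come from the surrounding test module and are not shown; for a textcat pipeline the annotations would carry a cats dict, roughly like this (labels invented for illustration):

TRAIN_DATA = [
    ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]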
Example #6
def train_spacy(data, epochs, batch_size=8):
    TRAINING_DATA = data
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')
    for _, annotations in TRAINING_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for epoch in range(epochs):
            print('Starting epoch: ' + str(epoch))
            random.shuffle(TRAINING_DATA)
            losses = {}
            for batch in minibatch(TRAINING_DATA, size=batch_size):
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update(
                        [example],
                        drop=0.2,  # Prevent overfitting
                        sgd=optimizer,
                        losses=losses,
                        )
            print(losses)
    return nlp
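A hypothetical call, assuming v3-style NER training data with character-offset entity spans:

data = [
    ("Berlin is a city in Germany",
     {"entities": [(0, 6, "GPE"), (20, 27, "GPE")]}),
]
trained = train_spacy(data, epochs=10)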
Example #7
def train_spacy(data, iterations):
    nlp = spacy.blank('de')  # create blank Language class

    # create the built-in pipeline components and add them to the pipeline
    if 'ner' not in nlp.pipe_names:
        nlp.add_pipe("ner", last=True)

    # get names of other pipes to disable them during training (not strictly
    # necessary here, since the Language class is blank)
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(data)
            losses = {}
            # build the examples once per iteration, then update on them
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in data
            ]
            nlp.update(
                examples,
                drop=0.5,  # dropout - make it harder to memorise data
                sgd=optimizer,
                losses=losses)
            print("Losses", losses)
    return nlp
Example #8
def test_Example_from_dict_with_parse(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    for i, token in enumerate(example.reference):
        assert token.dep_ == annots["deps"][i]
        assert token.head.i == annots["heads"][i]
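Here annots would supply a dependency parse; a plausible fixture, for illustration only, where heads are absolute token indices:

@pytest.fixture
def annots():
    # hypothetical parse annotations
    return {
        "words": ["I", "like", "London", "."],
        "heads": [1, 1, 1, 1],
        "deps": ["nsubj", "ROOT", "dobj", "punct"],
    }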
Example #9
def test_Example_from_dict_sentences():
    vocab = Vocab()
    predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"])
    annots = {"sent_starts": [1, 0, 0, 1, 0]}
    ex = Example.from_dict(predicted, annots)
    assert len(list(ex.reference.sents)) == 2

    # this currently throws an error - bug or feature?
    # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
    # annots = {"sent_starts": [1, 0, 0, 0, 0]}
    # ex = Example.from_dict(predicted, annots)
    # assert len(list(ex.reference.sents)) == 1

    predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"])
    annots = {"sent_starts": [1, -1, 0, 0, 0]}
    ex = Example.from_dict(predicted, annots)
    assert len(list(ex.reference.sents)) == 1
Example #10
def test_Example_missing_deps():
    vocab = Vocab()
    words = ["I", "like", "London", "and", "Berlin", "."]
    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]
    heads = [1, 1, 1, 2, 2, 1]
    annots_head_only = {"words": words, "heads": heads}
    annots_head_dep = {"words": words, "heads": heads, "deps": deps}
    predicted = Doc(vocab, words=words)

    # when not providing deps, the head information is considered to be missing
    # in this case, each token's head refers to the token itself
    example_1 = Example.from_dict(predicted, annots_head_only)
    assert [t.head.i for t in example_1.reference] == [0, 1, 2, 3, 4, 5]

    # when providing deps, the head information is actually used
    example_2 = Example.from_dict(predicted, annots_head_dep)
    assert [t.head.i for t in example_2.reference] == heads
Example #11
def test_Example_from_dict_with_tags(pred_words, annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=pred_words)
    example = Example.from_dict(predicted, annots)
    for i, token in enumerate(example.reference):
        assert token.tag_ == annots["tags"][i]
    aligned_tags = example.get_aligned("TAG", as_string=True)
    assert aligned_tags == ["NN" for _ in predicted]
Example #12
def test_Example_from_dict_with_cats(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.cats)) == 3
    assert example.reference.cats["cat1"] == 1.0
    assert example.reference.cats["cat2"] == 0.0
    assert example.reference.cats["cat3"] == 0.5
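The fixture behind this test would carry document-level categories; a sketch of the assumed shape:

annots = {
    "words": ["I", "like", "London", "."],
    "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},
}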
Example #13
def test_transformer_pipeline_tagger_senter_listener():
    """Test that a pipeline with just a transformer+tagger+senter runs and
    trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "tagger", "senter"]
    tagger = nlp.get_pipe("tagger")
    transformer = nlp.get_pipe("transformer")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(tagger_trf, TransformerListener)
    assert tagger_trf.upstream_name == "custom_upstream"
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # Check that the Transformer component finds its listeners
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert tagger_trf in transformer.listeners

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])
    _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
Example #14
def test_aligned_tags_multi():
    pred_words = ["Applysome", "sunscreen", "unless", "you", "can", "not"]
    gold_words = ["Apply", "somesun", "screen", "unless", "you", "cannot"]
    gold_tags = ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB"]
    annots = {"words": gold_words, "tags": gold_tags}
    vocab = Vocab()
    predicted = Doc(vocab, words=pred_words)
    example = Example.from_dict(predicted, annots)
    aligned_tags = example.get_aligned("TAG", as_string=True)
    assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"]
Example #15
def test_Example_from_dict_with_empty_entities():
    annots = {
        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
        "entities": [],
    }
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    # entities as empty list sets everything to O
    assert example.reference.has_annotation("ENT_IOB")
    assert len(list(example.reference.ents)) == 0
    assert all(token.ent_iob_ == "O" for token in example.reference)
    # various unset/missing entity annotations leave the entities unset
    annots["entities"] = None
    example = Example.from_dict(predicted, annots)
    assert not example.reference.has_annotation("ENT_IOB")
    annots.pop("entities", None)
    example = Example.from_dict(predicted, annots)
    assert not example.reference.has_annotation("ENT_IOB")
Example #16
def test_Example_from_dict_with_spans_overlapping(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.ents)) == 0
    assert len(list(example.reference.spans["cities"])) == 3
    assert len(list(example.reference.spans["people"])) == 1
    for span in example.reference.spans["cities"]:
        assert span.label_ == "LOC"
    for span in example.reference.spans["people"]:
        assert span.label_ == "PERSON"
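Overlapping spans cannot live in doc.ents, which is why the test expects them under doc.spans; the fixture presumably passes character-offset span groups, roughly like this (offsets assume the text "I like New York and Berlin."):

annots = {
    "words": ["I", "like", "New", "York", "and", "Berlin", "."],
    "spans": {
        "cities": [(7, 15, "LOC"), (7, 19, "LOC"), (20, 26, "LOC")],
        "people": [(0, 1, "PERSON")],
    },
}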
Example #17
def test_Example_from_dict_with_links(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert example.reference[0].ent_kb_id_ == ""
    assert example.reference[1].ent_kb_id_ == ""
    assert example.reference[2].ent_kb_id_ == "Q60"
    assert example.reference[3].ent_kb_id_ == "Q60"
    assert example.reference[4].ent_kb_id_ == ""
    assert example.reference[5].ent_kb_id_ == "Q64"
    assert example.reference[6].ent_kb_id_ == ""
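Entity links in Example.from_dict are keyed by character-offset pairs mapping to knowledge-base IDs; matching the assertions above, the fixture plausibly looks like:

annots = {
    "words": ["I", "like", "New", "York", "and", "Berlin", "."],
    "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
    "links": {(7, 15): {"Q60": 1.0}, (20, 26): {"Q64": 1.0}},
}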
Example #18
File: ex2.py Project: Kaggle/learntools
def train_func(model, train_data, optimizer, batch_size=8):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    for batch in minibatch(train_data, size=batch_size):
        for text, labels in batch:
            doc = model.make_doc(text)
            example = Example.from_dict(doc, labels)
            # Update model with texts and labels
            model.update([example], sgd=optimizer, losses=losses)

    return losses
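For context, a hypothetical call with textcat-style labels (the pipeline setup and label names here are assumptions, not part of the exercise):

import random
import spacy
from spacy.util import minibatch
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
train_data = [("I loved it", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})]
optimizer = nlp.initialize()
print(train_func(nlp, train_data, optimizer))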
Example #19
def test_Example_from_dict_with_sent_start(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.sents)) == 2
    for i, token in enumerate(example.reference):
        if to_ternary_int(annots["sent_starts"][i]) == 1:
            assert token.is_sent_start is True
        elif to_ternary_int(annots["sent_starts"][i]) == 0:
            assert token.is_sent_start is None
        else:
            assert token.is_sent_start is False
Example #20
def test_Example_from_dict_with_entities(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(list(example.reference.ents)) == 2
    # fmt: off
    assert [example.reference[i].ent_iob_ for i in range(7)] == ["O", "O", "B", "I", "O", "B", "O"]
    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2]
    # fmt: on
    assert example.reference[2].ent_type_ == "LOC"
    assert example.reference[3].ent_type_ == "LOC"
    assert example.reference[5].ent_type_ == "LOC"
Example #21
def test_transformer_pipeline_empty():
    """Test that the pipeline doesn't fail with empty input"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    tagger = nlp.get_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # train on empty doc
    optimizer = nlp.initialize()
    losses = {}
    empty_train_example = Example.from_dict(nlp.make_doc(""), {})
    nlp.update(train_examples, sgd=optimizer, losses=losses)
    nlp.update([empty_train_example], sgd=optimizer, losses=losses)
    train_examples.append(empty_train_example)
    nlp.update(train_examples, sgd=optimizer, losses=losses)

    # predict empty doc
    doc = nlp("")
    _assert_empty(doc._.trf_data)
    docs = nlp.pipe(["", ""])
    for doc in docs:
        _assert_empty(doc._.trf_data)
    nlp.pipe([])

    # predict combination of empty and non-empty
    doc = nlp("This is a sentence")
    normal_tags = [t.tag_ for t in doc]

    docs = list(nlp.pipe(["", "This is a sentence", "", ""]))
    _assert_empty(docs[0]._.trf_data)
    assert [t.tag_ for t in docs[0]] == []
    assert [t.tag_ for t in docs[1]] == normal_tags
    _assert_empty(docs[2]._.trf_data)
    _assert_empty(docs[3]._.trf_data)
Example #22
def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    text = "This is awesome"
    examples = [
        Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})
    ]
    optimizer = nlp.initialize(lambda: examples)
    # verify correct configuration with transformer listener
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    tagger_listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(tagger_listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == tagger_listener
    assert isinstance(transformer.model, TransformerModel)
    assert (nlp.config["components"]["transformer"]["model"]["@architectures"]
            == "spacy-transformers.TransformerModel.v2")
    assert (nlp.config["components"]["tagger"]["model"]["tok2vec"]
            ["@architectures"] == "spacy-transformers.TransformerListener.v1")
    # train pipe before replacing listeners
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        doc = nlp(text)

    preds = [t.tag_ for t in doc]
    doc_tensor = tagger_tok2vec.predict([doc])

    # replace listener and verify predictions are still the same
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert isinstance(tagger_tok2vec, Model)
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    assert (nlp.config["components"]["tagger"]["model"]["tok2vec"]
            ["@architectures"] == "spacy-transformers.Tok2VecTransformer.v2")
    doc2 = nlp(text)
    assert preds == [t.tag_ for t in doc2]
    assert_equal(doc_tensor, tagger_tok2vec.predict([doc2]))
    # attempt training with the new pipeline
    optimizer = nlp.resume_training()
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tagger"] > 0.0

    # check for presence of additional fields in model_output
    assert doc2._.trf_data.model_output.pooler_output is not None
    assert doc2._.trf_data.model_output.attentions is not None
Example #23
def train(model, train_data, optimizer, batch_size=8):
    losses = {}
    random.seed(1)
    random.shuffle(train_data)
    # train_data is a list of tuples [(text0, label0), (text1, label1), ...]
    for batch in minibatch(train_data, size=batch_size):
        # split batch into text and labels
        for text, labels in batch:
            doc = model.make_doc(text)
            example = Example.from_dict(doc, labels)
            # Update model with texts and labels
            model.update([example], sgd=optimizer, losses=losses)
    return losses
Example #24
def simple_nlp():
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    return nlp
Example #25
def test_transformer_pipeline_tagger_internal():
    """Test that a tagger with internal transformer runs and trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config,
                                      auto_fill=True,
                                      validate=True)
    assert nlp.pipe_names == ["tagger"]
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(tagger_trf, Model)
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("We're interested at underwater basket weaving.")
    doc_tensor = tagger_trf.predict([doc])

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_config(orig_config,
                                           auto_fill=True,
                                           validate=True)
        nlp2.initialize(lambda: train_examples)

        # results are not the same if we don't call from_disk
        doc2 = nlp2("We're interested at underwater basket weaving.")
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        with pytest.raises(AssertionError):
            assert_equal(doc_tensor2.doc_data[0].tensors,
                         doc_tensor.doc_data[0].tensors)

        # results ARE the same if we call from_disk
        nlp2.from_disk(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        assert_equal(doc_tensor2.doc_data[0].tensors,
                     doc_tensor.doc_data[0].tensors)
Example #26
def test_transformer_pipeline_simple():
    """Test that a simple pipeline with just a transformer at least runs"""
    nlp = Language()
    nlp.add_pipe("transformer")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))

    optimizer = nlp.initialize()
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    doc = nlp("We're interested at underwater basket weaving.")
    assert doc
Example #27
def test_replace_listeners_invalid():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    text = "This is awesome"
    examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
    optimizer = nlp.initialize(lambda: examples)
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    with pytest.raises(ValueError):
        nlp.replace_listeners("invalid", "tagger", ["model.tok2vec"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("transformer", "parser", ["model.tok2vec"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("transformer", "tagger", ["model.yolo"])
    with pytest.raises(ValueError):
        nlp.replace_listeners("transformer", "tagger", ["model.tok2vec", "model.yolo"])
Example #28
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains + IO goes OK"""
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    optimizer = nlp.initialize(lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
Example #29
def test_Example_missing_heads():
    vocab = Vocab()
    words = ["I", "like", "London", "and", "Berlin", "."]
    deps = ["nsubj", "ROOT", "dobj", None, "conj", "punct"]
    heads = [1, 1, 1, None, 2, 1]
    annots = {"words": words, "heads": heads, "deps": deps}
    predicted = Doc(vocab, words=words)

    example = Example.from_dict(predicted, annots)
    parsed_heads = [t.head.i for t in example.reference]
    assert parsed_heads[0] == heads[0]
    assert parsed_heads[1] == heads[1]
    assert parsed_heads[2] == heads[2]
    assert parsed_heads[4] == heads[4]
    assert parsed_heads[5] == heads[5]
    expected = [True, True, True, False, True, True]
    assert [t.has_head() for t in example.reference] == expected
    # Ensure that the missing head doesn't create an artificial new sentence start
    expected = [True, False, False, False, False, False]
    assert example.get_aligned_sent_starts() == expected
Example #30
def train_spacy(TRAINING_DATA, epochs):
    # nlp = spacy.blank('en')
    nlp = spacy.load('./holocaust_model')
    ner = nlp.add_pipe('ner', name='ner_conc_camp')
    ner.add_label('CAMP')
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner_conc_camp']
    with nlp.disable_pipes(*other_pipes):  # disable other pipes so training doesn't affect them
        optimizer = nlp.begin_training()
        for epoch in range(epochs):
            print(f'Starting epoch: {epoch}')
            random.shuffle(TRAINING_DATA)
            losses = {}
            for batch in minibatch(TRAINING_DATA, size=8):
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example],
                               drop=0.2,
                               sgd=optimizer,
                               losses=losses)
            print(losses)
    return nlp