def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, vectorError=False):
    """
    Loads a spaCy model.

    OBS: vectorError is a temporary, ugly workaround for an error encountered
    when keeping two models and not being able to find the reference name for
    the vectors.
    """
    from spacy.util import load_model_from_path

    if textcat is None or vectorError:
        modelname = 'spacy'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)

        nlp = load_model_from_path(model_weight_path)

    if textcat == 'sentiment':
        modelname = 'spacy.sentiment'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)

        # quick fix for misaligned model storage:
        import os
        model_weight_path = os.path.join(model_weight_path, 'spacy.sentiment')

        nlp = load_model_from_path(model_weight_path)

    return nlp
def test_lemmatizer_serialize(nlp):
    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
    nlp.initialize()

    def cope_lookups():
        lookups = Lookups()
        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
        return lookups

    nlp2 = English()
    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
    lemmatizer2.initialize(lookups=cope_lookups())
    lemmatizer2.from_bytes(lemmatizer.to_bytes())
    assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
    assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2.make_doc("coping")
        doc2[0].pos_ = "VERB"
        assert doc2[0].lemma_ == ""
        doc2 = lemmatizer2(doc2)
        assert doc2[0].text == "coping"
        assert doc2[0].lemma_ == "cope"

    # Make sure that lemmatizer cache can be pickled
    b = pickle.dumps(lemmatizer2)
def test_transformer_pipeline_textcat():
    """Test that a pipeline with just a transformer+textcat runs and trains
    properly. This used to throw an error because of shape inference issues -
    cf https://github.com/explosion/spaCy/issues/6401"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "textcat"]
    train_examples = []

    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("We're interested at underwater basket weaving.")
    cats1 = doc.cats

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2("We're interested at underwater basket weaving.")
        cats2 = doc2.cats
        assert cats1 == cats2
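The to_disk / load_model_from_path round trip that the tests in this listing keep repeating can be reduced to a minimal sketch (assuming spaCy v3.x; the sentencizer is only a stand-in component):

from spacy.lang.en import English
from spacy.util import load_model_from_path, make_tempdir

nlp = English()
nlp.add_pipe("sentencizer")
with make_tempdir() as tmp_dir:
    nlp.to_disk(tmp_dir)                      # write config, tokenizer and pipes
    nlp2 = load_model_from_path(tmp_dir)      # rebuild the pipeline from that directory
    assert nlp2.pipe_names == ["sentencizer"]
    assert len(list(nlp2("One. Two.").sents)) == 2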
def generate_meta(model_path, existing_meta):
    meta = existing_meta or {}
    settings = [('lang', 'Model language', meta.get('lang', 'en')),
                ('name', 'Model name', meta.get('name', 'model')),
                ('version', 'Model version', meta.get('version', '0.0.0')),
                ('spacy_version', 'Required spaCy version',
                 '>=%s,<3.0.0' % about.__version__),
                ('description', 'Model description',
                 meta.get('description', False)),
                ('author', 'Author', meta.get('author', False)),
                ('email', 'Author email', meta.get('email', False)),
                ('url', 'Author website', meta.get('url', False)),
                ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
    nlp = util.load_model_from_path(Path(model_path))
    meta['pipeline'] = nlp.pipe_names
    meta['vectors'] = {'width': nlp.vocab.vectors_length,
                       'vectors': len(nlp.vocab.vectors),
                       'keys': nlp.vocab.vectors.n_keys}
    prints(Messages.M047, title=Messages.M046)
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
    if about.__title__ != 'spacy':
        meta['parent_package'] = about.__title__
    return meta
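For reference, the meta dict returned by the function above ends up with roughly this shape; the values below are purely illustrative (the real ones come from the loaded model and the interactive prompts):

meta = {
    "lang": "en",
    "name": "model",
    "version": "0.0.0",
    "spacy_version": ">=2.0.0,<3.0.0",    # derived from about.__version__
    "description": False,                 # False when no value is supplied
    "author": False,
    "email": False,
    "url": False,
    "license": "CC BY-SA 3.0",
    "pipeline": ["tagger", "parser", "ner"],
    "vectors": {"width": 300, "vectors": 20000, "keys": 20000},
}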
def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert tagger.model.get_dim("nO") == len(TAGS)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] < 0.00001

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].tag_ == "N"
        assert doc2[1].tag_ == "V"
        assert doc2[2].tag_ == "J"
        assert doc2[3].tag_ == "N"

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "I like green eggs.",
        "Here is another one.",
        "I eat ham.",
    ]
    batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([TAG]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # Try to unlearn the first 'N' tag with negative annotation
    neg_ex = Example.from_dict(nlp.make_doc(test_text), {"tags": ["!N", "V", "J", "N"]})
    for i in range(20):
        losses = {}
        nlp.update([neg_ex], sgd=optimizer, losses=losses)

    # test the "untrained" tag
    doc3 = nlp(test_text)
    assert doc3[0].tag_ != "N"
def test_overfitting_IO(use_upper):
    # Simple test to try and quickly overfit the NER component
    nlp = English()
    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.00001

    # test the trained model
    test_text = "I like London."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "London"
        assert ents2[0].label_ == "LOC"

        # Ensure that the predictions are still the same, even after adding a new label
        ner2 = nlp2.get_pipe("ner")
        assert ner2.model.attrs["has_upper"] == use_upper
        ner2.add_label("RANDOM_NEW_LABEL")
        doc3 = nlp2(test_text)
        ents3 = doc3.ents
        assert len(ents3) == 1
        assert ents3[0].text == "London"
        assert ents3[0].label_ == "LOC"

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
def test_beam_overfitting_IO(neg_key):
    # Simple test to try and quickly overfit the Beam NER component
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "incorrect_spans_key": neg_key,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()

    # run overfitting
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_ner"] < 0.0001

    # test the scores from the beam
    test_text = "I like London"
    docs = [nlp.make_doc(test_text)]
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
    assert entity_scores[(2, 3, "PERSON")] == 0.0
    assert len(nlp(test_text).ents) == 1

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
        assert entity_scores2[(2, 3, "LOC")] == 1.0
        assert entity_scores2[(2, 3, "PERSON")] == 0.0

    # Try to unlearn the entity by using negative annotations
    neg_doc = nlp.make_doc(test_text)
    neg_ex = Example(neg_doc, neg_doc)
    neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
    neg_train_examples = [neg_ex]

    for i in range(20):
        losses = {}
        nlp.update(neg_train_examples, sgd=optimizer, losses=losses)

    # test the "untrained" model
    assert len(nlp(test_text).ents) == 0
def test_transformer_pipeline_tagger_senter_listener():
    """Test that a pipeline with just a transformer+tagger+senter runs and
    trains properly"""
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "tagger", "senter"]
    tagger = nlp.get_pipe("tagger")
    transformer = nlp.get_pipe("transformer")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(tagger_trf, TransformerListener)
    assert tagger_trf.upstream_name == "custom_upstream"
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # Check that the Transformer component finds its listeners
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert tagger_trf in transformer.listeners

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])
    _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
def test_overfitting_IO():
    # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat")

    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 2

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat"] < 0.01

    # test the trained model
    test_text = "I am happy."
    doc = nlp(test_text)
    cats = doc.cats
    assert cats["POSITIVE"] > 0.9
    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        cats2 = doc2.cats
        assert cats2["POSITIVE"] > 0.9
        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert scores["cats_micro_f"] == 1.0
    assert scores["cats_macro_f"] == 1.0
    assert scores["cats_macro_auc"] == 1.0
    assert scores["cats_score"] == 1.0
    assert "cats_score_desc" in scores

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
    batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
    for cats_1, cats_2 in zip(batch_cats_1, batch_cats_2):
        for cat in cats_1:
            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
    for cats_1, cats_2 in zip(batch_cats_1, no_batch_cats):
        for cat in cats_1:
            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
def test_overfitting_IO():
    # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly
    nlp = English()
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    # add some cases where SENT_START == -1
    train_examples[0].reference[10].is_sent_start = False
    train_examples[1].reference[1].is_sent_start = False
    train_examples[1].reference[11].is_sent_start = False

    nlp.add_pipe("senter")
    optimizer = nlp.initialize()

    for i in range(200):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["senter"] < 0.001

    # test the trained model
    test_text = TRAIN_DATA[0][0]
    doc = nlp(test_text)
    gold_sent_starts = [0] * 14
    gold_sent_starts[0] = 1
    gold_sent_starts[5] = 1
    gold_sent_starts[9] = 1
    assert [int(t.is_sent_start) for t in doc] == gold_sent_starts

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([SENT_START]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([SENT_START]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # test internal pipe labels vs. Language.pipe_labels with hidden labels
    assert nlp.get_pipe("senter").labels == ("I", "S")
    assert "senter" not in nlp.pipe_labels
def test_overfitting_IO():
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer.min_tree_freq = 1
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["trainable_lemmatizer"] < 0.00001

    test_text = "She likes blue eggs"
    doc = nlp(test_text)
    assert doc[0].lemma_ == "she"
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"
    assert doc[3].lemma_ == "egg"

    # Check model after a {to,from}_disk roundtrip
    with util.make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].lemma_ == "she"
        assert doc2[1].lemma_ == "like"
        assert doc2[2].lemma_ == "blue"
        assert doc2[3].lemma_ == "egg"

    # Check model after a {to,from}_bytes roundtrip
    nlp_bytes = nlp.to_bytes()
    nlp3 = English()
    nlp3.add_pipe("trainable_lemmatizer")
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(test_text)
    assert doc3[0].lemma_ == "she"
    assert doc3[1].lemma_ == "like"
    assert doc3[2].lemma_ == "blue"
    assert doc3[3].lemma_ == "egg"

    # Check model after a pickle roundtrip.
    nlp_bytes = pickle.dumps(nlp)
    nlp4 = pickle.loads(nlp_bytes)
    doc4 = nlp4(test_text)
    assert doc4[0].lemma_ == "she"
    assert doc4[1].lemma_ == "like"
    assert doc4[2].lemma_ == "blue"
    assert doc4[3].lemma_ == "egg"
def test_overfitting_IO(pipe_name):
    # Simple test to try and quickly overfit the dependency parser (normal or beam)
    nlp = English()
    parser = nlp.add_pipe(pipe_name)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()

    # run overfitting
    for i in range(200):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses[pipe_name] < 0.0001

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    assert doc[0].dep_ == "nsubj"
    assert doc[2].dep_ == "dobj"
    assert doc[3].dep_ == "punct"
    assert doc[0].head.i == 1
    assert doc[2].head.i == 1
    assert doc[3].head.i == 1

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].dep_ == "nsubj"
        assert doc2[2].dep_ == "dobj"
        assert doc2[3].dep_ == "punct"
        assert doc2[0].head.i == 1
        assert doc2[2].head.i == 1
        assert doc2[3].head.i == 1

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([DEP]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """
    Loads a spaCy model.
    """
    from spacy.util import load_model_from_path

    model_weight_path = download_model('spacy', cache_dir,
                                       process_func=_unzip_process_func,
                                       verbose=verbose)
    nlp = load_model_from_path(model_weight_path)

    return nlp
def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names
def load_spacy_model(cache_dir=DEFAULT_CACHE_DIR, verbose=False, textcat=None, vectorError=False):
    """
    Loads a spaCy model.

    :param str cache_dir: the directory for storing cached models
    :param bool verbose: `True` to increase verbosity
    :param str textcat: '`sentiment`' for loading the spaCy sentiment analyser
    :param bool vectorError:
    :return: a spaCy model

    .. warning:: vectorError is a temporary workaround for an error encountered
        when keeping two models and not being able to find the reference name
        for the vectors.
    """
    from spacy.util import load_model_from_path

    if textcat is None or vectorError:
        modelname = 'spacy'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)

        nlp = load_model_from_path(model_weight_path)

    if textcat == 'sentiment':
        modelname = 'spacy.sentiment'

        model_weight_path = download_model(modelname, cache_dir,
                                           process_func=_unzip_process_func,
                                           verbose=verbose)

        # quick fix for misaligned model storage:
        import os
        model_weight_path = os.path.join(model_weight_path, 'spacy.sentiment')

        nlp = load_model_from_path(model_weight_path)

    return nlp
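A hypothetical usage sketch for the helper above (DaNLP-style loader; assumes the model archives can be downloaded into the cache directory and that the base model is Danish):

# Plain Danish spaCy model
nlp = load_spacy_model()
doc = nlp("Jeg bor i København.")
print([(token.text, token.pos_) for token in doc])

# Variant with the sentiment text classifier in the pipeline
nlp_sent = load_spacy_model(textcat='sentiment')
doc = nlp_sent("Det er en rigtig god dag.")
print(doc.cats)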
def test_overfitting_IO_multi():
    # Simple test to try and quickly overfit the multi-label textcat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe("textcat_multilabel")

    train_examples = []
    for text, annotations in TRAIN_DATA_MULTI_LABEL:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.get_dim("nO") == 3

    for i in range(100):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["textcat_multilabel"] < 0.01

    # test the trained model
    test_text = "I am confused but happy."
    doc = nlp(test_text)
    cats = doc.cats
    assert cats["HAPPY"] > 0.9
    assert cats["CONFUSED"] > 0.9

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        cats2 = doc2.cats
        assert cats2["HAPPY"] > 0.9
        assert cats2["CONFUSED"] > 0.9

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert scores["cats_micro_f"] == 1.0
    assert scores["cats_macro_f"] == 1.0
    assert "cats_score_desc" in scores

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = ["Just a sentence.", "I like green eggs.", "I am happy.", "I eat ham."]
    batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
def test_issue999():
    """Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
       end up out-of-scale, causing it to learn too fast.
    """
    TRAIN_DATA = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
        ["show me chinese restaurants", [(8, 15, "CUISINE")]],
        ["show me chines restaurants", [(8, 14, "CUISINE")]],
    ]
    nlp = English()
    ner = nlp.add_pipe("ner")
    for _, offsets in TRAIN_DATA:
        for start, end, label in offsets:
            ner.add_label(label)
    nlp.initialize()
    for itn in range(20):
        random.shuffle(TRAIN_DATA)
        for raw_text, entity_offsets in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(raw_text), {"entities": entity_offsets})
            nlp.update([example])

    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = util.load_model_from_path(model_dir)

    for raw_text, entity_offsets in TRAIN_DATA:
        doc = nlp2(raw_text)
        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
        for start, end, label in entity_offsets:
            if (start, end) in ents:
                assert ents[(start, end)] == label
                break
        else:
            if entity_offsets:
                raise Exception(ents)
def test_overfitting_IO():
    # Simple test to try and quickly overfit the spancat component - ensuring the ML models work correctly
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
    train_examples = make_examples(nlp)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 2
    assert set(spancat.labels) == {"LOC", "PERSON"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    assert doc.spans[spancat.key] == doc.spans[SPAN_KEY]
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 2
    assert len(spans.attrs["scores"]) == 2
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {"London", "Berlin"}
    assert set([span.label_ for span in spans]) == {"LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 2
        assert len(spans2.attrs["scores"]) == 2
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {"London", "Berlin"}
        assert set([span.label_ for span in spans2]) == {"LOC"}

    # Test scoring
    scores = nlp.evaluate(train_examples)
    assert f"spans_{SPAN_KEY}_f" in scores
    assert scores[f"spans_{SPAN_KEY}_p"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_r"] == 1.0
    assert scores[f"spans_{SPAN_KEY}_f"] == 1.0

    # also test that the spancat works for just a single entity in a sentence
    doc = nlp("London")
    assert len(doc.spans[spancat.key]) == 1
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains + IO goes OK"""
    orig_config = Config().from_str(cfg_string)
    orig_config["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)
    optimizer = nlp.initialize(lambda: train_examples)

    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    text = "We're interested at underwater basket weaving."
    doc = nlp(text)
    doc_tensor = tagger_trf.predict([doc])

    # ensure IO goes OK
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc2 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
        doc_tensor2 = tagger_trf2.predict([doc2])
        _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)

        # make sure that this can be saved to directory once more
        file_path_2 = d / "trained_nlp_2"
        nlp2.to_disk(file_path_2)

    # ensure to_bytes / from_bytes works
    nlp_bytes = nlp.to_bytes()
    nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    nlp3.from_bytes(nlp_bytes)
    doc3 = nlp3(text)
    tagger3 = nlp3.get_pipe("tagger")
    tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
    doc_tensor3 = tagger_trf3.predict([doc3])
    _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
def test_overfitting_IO_overlapping():
    # Test for overfitting on overlapping entities
    fix_random_seed(0)
    nlp = English()
    spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})

    train_examples = make_examples(nlp, data=TRAIN_DATA_OVERLAPPING)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert spancat.model.get_dim("nO") == 3
    assert set(spancat.labels) == {"PERSON", "LOC", "DOUBLE_LOC"}

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["spancat"] < 0.01

    # test the trained model
    test_text = "I like London and Berlin"
    doc = nlp(test_text)
    spans = doc.spans[SPAN_KEY]
    assert len(spans) == 3
    assert len(spans.attrs["scores"]) == 3
    assert min(spans.attrs["scores"]) > 0.9
    assert set([span.text for span in spans]) == {
        "London",
        "Berlin",
        "London and Berlin",
    }
    assert set([span.label_ for span in spans]) == {"LOC", "DOUBLE_LOC"}

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        spans2 = doc2.spans[SPAN_KEY]
        assert len(spans2) == 3
        assert len(spans2.attrs["scores"]) == 3
        assert min(spans2.attrs["scores"]) > 0.9
        assert set([span.text for span in spans2]) == {
            "London",
            "Berlin",
            "London and Berlin",
        }
        assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
def test_attributeruler_serialize(nlp, pattern_dicts):
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)

    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
    assert a.patterns == a_reloaded.patterns

    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
        assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns
def __call__(self, train_data, dev_data, test_data, n_iter, dropout, patience=10):
    if "ner" in self.nlp.pipe_names:
        logging.warning("Pipeline already has NER, removing...")
        self.nlp.remove_pipe("ner")
    ner = self.nlp.create_pipe("ner")
    # ner.add_multitask_objective(get_position_label)
    self.nlp.add_pipe(ner, last=True)

    for sent, ann in train_data + dev_data + test_data:
        for _, _, label in ann:
            ner.add_label(label)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != "ner"]

    logging.info("Starting the training with {} iterations".format(n_iter))
    fix_random_seed(42)
    best_dev_score = 0
    best_dev_itn = -1

    with self.nlp.disable_pipes(*other_pipes):  # only train NER
        # if not self.model:  # FIXME: pre-train the model
        mlflow.log_param("base_model", Path(self.model).name)
        mlflow.log_param("n_iter", n_iter)
        mlflow.log_param("dropout", dropout)
        self.nlp.begin_training()
        for itn in tqdm(list(range(n_iter)), desc="iterations"):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = list(minibatch(train_data, size=compounding(4.0, 32.0, 1.001)))
            for batch in tqdm(batches, desc="batches"):
                texts, annotations = zip(*batch)
                self.nlp.update(
                    docs=texts,  # batch of texts
                    golds=[self._sent_to_goldparse(t, a) for t, a in batch],  # batch of annotations
                    drop=dropout,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            # logging.info("Losses: {}".format(losses))
            mlflow.log_metric("loss", losses["ner"], step=itn)

            train_scores = self.evaluate(train_data)
            mlflow.log_metrics({"train_" + k: v for k, v in train_scores.items()}, step=itn)
            dev_scores = self.evaluate(dev_data)
            mlflow.log_metrics({"dev_" + k: v for k, v in dev_scores.items()}, step=itn)
            self.store_model(Path(self.output_dir) / "model_{}".format(itn))

            act_score = dev_scores["f1"]
            if best_dev_score < act_score:
                best_dev_score = act_score
                best_dev_itn = itn
            else:
                if itn - best_dev_itn >= patience:
                    break

    load_model_from_path(Path(self.output_dir) / "model_{}".format(best_dev_itn))
    scores = self.evaluate(test_data)
    logging.info("Test scores {}".format(scores))
    mlflow.log_metrics(scores, step=itn)
    self.store_model(Path(self.output_dir) / "model-final")
def test_overfitting_IO():
    # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
    nlp = English()
    nlp.add_pipe("morphologizer")
    train_examples = []
    for inst in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert [str(t.morph) for t in doc2] == gold_morphs
        assert [t.pos_ for t in doc2] == gold_pos_tags

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # Test without POS
    nlp.remove_pipe("morphologizer")
    nlp.add_pipe("morphologizer")
    for example in train_examples:
        for token in example.reference:
            token.pos_ = ""
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # Test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["", "", "", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags

    # Test with unset morph and partial POS
    nlp.remove_pipe("morphologizer")
    nlp.add_pipe("morphologizer")
    for example in train_examples:
        for token in example.reference:
            if token.text == "ham":
                token.pos_ = "NOUN"
            else:
                token.pos_ = ""
            token.set_morph(None)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    print(nlp.get_pipe("morphologizer").labels)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001

    # Test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["", "", "", ""]
    gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags
def train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()

    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):
        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               losses=losses, drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
        print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i

    # save model to output directory
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
def train_parser_and_tagger(train_json_path: str,
                            dev_json_path: str,
                            test_json_path: str,
                            model_output_dir: str,
                            model_path: str = None,
                            ontonotes_path: str = None,
                            ontonotes_train_percent: float = 0.0):
    """Function to train the spacy parser and tagger from a blank model, with the default, en_core_web_sm vocab.
       Training setup is mostly copied from the spacy cli train command.

       @param train_json_path: path to the conll formatted training data
       @param dev_json_path: path to the conll formatted dev data
       @param test_json_path: path to the conll formatted test data
       @param model_output_dir: path to the output directory for the trained models
       @param model_path: path to the model to load
       @param ontonotes_path: path to the directory containing ontonotes in spacy format (optional)
       @param ontonotes_train_percent: percentage of the ontonotes training data to use (optional)
    """
    msg = Printer()

    train_json_path = cached_path(train_json_path)
    dev_json_path = cached_path(dev_json_path)
    test_json_path = cached_path(test_json_path)

    if model_path is not None:
        nlp = spacy.load(model_path)
    else:
        lang_class = util.get_lang_class('en')
        nlp = lang_class()

    if 'tagger' not in nlp.pipe_names:
        tagger = nlp.create_pipe('tagger')
        nlp.add_pipe(tagger, first=True)
    else:
        tagger = nlp.get_pipe('tagger')

    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser)
    else:
        parser = nlp.get_pipe('parser')

    train_corpus = GoldCorpus(train_json_path, dev_json_path)
    test_corpus = GoldCorpus(train_json_path, test_json_path)

    if ontonotes_path:
        onto_train_path = os.path.join(ontonotes_path, "train")
        onto_dev_path = os.path.join(ontonotes_path, "dev")
        onto_test_path = os.path.join(ontonotes_path, "test")
        onto_train_corpus = GoldCorpus(onto_train_path, onto_dev_path)
        onto_test_corpus = GoldCorpus(onto_train_path, onto_test_path)

    dropout_rates = util.decaying(0.2, 0.2, 0.0)
    batch_sizes = util.compounding(1., 16., 1.001)

    if model_path is not None:
        meta = nlp.meta
    else:
        meta = {}
        meta["lang"] = "en"
        meta["pipeline"] = ["tagger", "parser"]
        meta["name"] = "scispacy_core_web_sm"
        meta["license"] = "CC BY-SA 3.0"
        meta["author"] = "Allen Institute for Artificial Intelligence"
        meta["url"] = "allenai.org"
        meta["sources"] = ["OntoNotes 5", "Common Crawl", "GENIA 1.0"]
        meta["version"] = "1.0.0"
        meta["spacy_version"] = ">=2.2.1"
        meta["parent_package"] = "spacy"
        meta["email"] = "*****@*****.**"

    n_train_words = train_corpus.count_train()

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tagger', 'parser']]
    with nlp.disable_pipes(*other_pipes):
        if ontonotes_path:
            optimizer = nlp.begin_training(lambda: itertools.chain(train_corpus.train_tuples,
                                                                   onto_train_corpus.train_tuples))
        else:
            optimizer = nlp.begin_training(lambda: train_corpus.train_tuples)
        nlp._optimizer = None

        train_docs = train_corpus.train_docs(nlp)
        train_docs = list(train_docs)

        train_mixture = train_docs
        if ontonotes_path:
            onto_train_docs = onto_train_corpus.train_docs(nlp)
            onto_train_docs = list(onto_train_docs)
            num_onto_docs = int(float(ontonotes_train_percent)*len(onto_train_docs))
            randomly_sampled_onto = random.sample(onto_train_docs, num_onto_docs)
            train_mixture += randomly_sampled_onto

    row_head, output_stats = _configure_training_output(nlp.pipe_names, -1, False)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}

    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)

    best_epoch = 0
    best_epoch_uas = 0.0
    for i in range(20):
        random.shuffle(train_mixture)
        with nlp.disable_pipes(*other_pipes):
            with tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                minibatches = list(util.minibatch(train_docs, size=batch_sizes))
                for batch in minibatches:
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

        # save intermediate model and output results on the dev set
        with nlp.use_params(optimizer.averages):
            epoch_model_path = os.path.join(model_output_dir, "model"+str(i))
            os.makedirs(epoch_model_path, exist_ok=True)
            nlp.to_disk(epoch_model_path)

            with open(os.path.join(model_output_dir, "model"+str(i), "meta.json"), "w") as meta_fp:
                meta_fp.write(json.dumps(meta))

            nlp_loaded = util.load_model_from_path(epoch_model_path)
            dev_docs = train_corpus.dev_docs(nlp_loaded)
            dev_docs = list(dev_docs)
            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
            start_time = timer()
            scorer = nlp_loaded.evaluate(dev_docs)
            end_time = timer()
            gpu_wps = None
            cpu_wps = nwords/(end_time-start_time)

            if ontonotes_path:
                onto_dev_docs = list(onto_train_corpus.dev_docs(nlp_loaded))
                onto_scorer = nlp_loaded.evaluate(onto_dev_docs)

        if scorer.scores["uas"] > best_epoch_uas:
            best_epoch_uas = scorer.scores["uas"]
            best_epoch = i

        progress = _get_progress(
            i, losses, scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
        )
        msg.row(progress, **row_settings)

        if ontonotes_path:
            progress = _get_progress(
                i, losses, onto_scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
            )
            msg.row(progress, **row_settings)

    # save final model and output results on the test set
    final_model_path = os.path.join(model_output_dir, "best")
    if os.path.exists(final_model_path):
        shutil.rmtree(final_model_path)
    shutil.copytree(os.path.join(model_output_dir, "model" + str(best_epoch)),
                    final_model_path)

    nlp_loaded = util.load_model_from_path(final_model_path)
    start_time = timer()
    test_docs = test_corpus.dev_docs(nlp_loaded)
    test_docs = list(test_docs)
    nwords = sum(len(doc_gold[0]) for doc_gold in test_docs)
    scorer = nlp_loaded.evaluate(test_docs)
    end_time = timer()
    gpu_wps = None
    cpu_wps = nwords/(end_time-start_time)
    meta["speed"] = {"gpu": None, "nwords": nwords, "cpu": cpu_wps}

    print("Retrained genia evaluation")
    print("Test results:")
    print("UAS:", scorer.uas)
    print("LAS:", scorer.las)
    print("Tag %:", scorer.tags_acc)
    print("Token acc:", scorer.token_acc)
    with open(os.path.join(model_output_dir, "genia_test.json"), "w+") as metric_file:
        json.dump(scorer.scores, metric_file)
    with open(os.path.join(model_output_dir, "best", "meta.json"), "w") as meta_fp:
        meta_fp.write(json.dumps(meta))

    if ontonotes_path:
        onto_test_docs = list(onto_test_corpus.dev_docs(nlp_loaded))
        print("Retrained ontonotes evaluation")
        scorer_onto_retrained = nlp_loaded.evaluate(onto_test_docs)
        print("Test results:")
        print("UAS:", scorer_onto_retrained.uas)
        print("LAS:", scorer_onto_retrained.las)
        print("Tag %:", scorer_onto_retrained.tags_acc)
        print("Token acc:", scorer_onto_retrained.token_acc)

        with open(os.path.join(model_output_dir, "ontonotes_test.json"), "w+") as metric_file:
            json.dump(scorer_onto_retrained.scores, metric_file)
def test_serialize_subclassed_kb():
    """Check that IO of a custom KB works fine as part of an EL pipe."""

    config_string = """
    [nlp]
    lang = "en"
    pipeline = ["entity_linker"]

    [components]

    [components.entity_linker]
    factory = "entity_linker"

    [initialize]

    [initialize.components]

    [initialize.components.entity_linker]

    [initialize.components.entity_linker.kb_loader]
    @misc = "spacy.CustomKB.v1"
    entity_vector_length = 342
    custom_field = 666
    """

    class SubKnowledgeBase(KnowledgeBase):
        def __init__(self, vocab, entity_vector_length, custom_field):
            super().__init__(vocab, entity_vector_length)
            self.custom_field = custom_field

    @registry.misc("spacy.CustomKB.v1")
    def custom_kb(
        entity_vector_length: int, custom_field: int
    ) -> Callable[[Vocab], KnowledgeBase]:
        def custom_kb_factory(vocab):
            kb = SubKnowledgeBase(
                vocab=vocab,
                entity_vector_length=entity_vector_length,
                custom_field=custom_field,
            )
            kb.add_entity("random_entity", 0.0, zeros(entity_vector_length))
            return kb

        return custom_kb_factory

    config = Config().from_str(config_string)
    nlp = load_model_from_config(config, auto_fill=True)
    nlp.initialize()

    entity_linker = nlp.get_pipe("entity_linker")
    assert type(entity_linker.kb) == SubKnowledgeBase
    assert entity_linker.kb.entity_vector_length == 342
    assert entity_linker.kb.custom_field == 666

    # Make sure the custom KB is serialized correctly
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        entity_linker2 = nlp2.get_pipe("entity_linker")
        # After IO, the KB is the standard one
        assert type(entity_linker2.kb) == KnowledgeBase
        assert entity_linker2.kb.entity_vector_length == 342
        assert not hasattr(entity_linker2.kb, "custom_field")
def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    text = "This is awesome"
    examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
    optimizer = nlp.initialize(lambda: examples)

    # verify correct configuration with transformer listener
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    tagger_listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(tagger_listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == tagger_listener
    assert isinstance(transformer.model, TransformerModel)
    assert (
        nlp.config["components"]["transformer"]["model"]["@architectures"]
        == "spacy-transformers.TransformerModel.v3"
    )
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.TransformerListener.v1"
    )

    # train pipe before replacing listeners
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    doc = nlp(text)
    preds = [t.tag_ for t in doc]
    doc_tensor = tagger_tok2vec.predict([doc])

    # replace listener and verify predictions are still the same
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert isinstance(tagger_tok2vec, Model)
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    assert (
        nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
        == "spacy-transformers.Tok2VecTransformer.v3"
    )
    doc2 = nlp(text)
    assert preds == [t.tag_ for t in doc2]
    pred_tensor = tagger_tok2vec.predict([doc2])
    _assert_equal_tensors(doc_tensor, pred_tensor)

    # attempt training with the new pipeline
    optimizer = nlp.resume_training()
    for i in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] > 0.0

    # check for presence of additional fields in model_output
    assert doc2._.trf_data.model_output.pooler_output is not None
    assert doc2._.trf_data.model_output.attentions is not None

    # ensure IO goes OK
    doc_tensor_trained = tagger_tok2vec.predict([doc])
    with make_tempdir() as d:
        file_path = d / "trained_nlp"
        nlp.to_disk(file_path)
        nlp2 = util.load_model_from_path(file_path)
        doc3 = nlp2(text)
        tagger2 = nlp2.get_pipe("tagger")
        tagger_tok2vec2 = tagger2.model.get_ref("tok2vec")
        pred_tensor = tagger_tok2vec2.predict([doc3])
        _assert_equal_tensors(doc_tensor_trained, pred_tensor)
def test_overfitting_IO():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
    vector_length = 3
    assert "Q2146908" not in nlp.vocab.strings

    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create artificial KB - assign same prior weight to the two russ cochran's
        # Q2146908 (Russ Cochran): American golfer
        # Q7381115 (Russ Cochran): publisher
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="Russ Cochran",
            entities=["Q2146908", "Q7381115"],
            probabilities=[0.5, 0.5],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    assert "Q2146908" in entity_linker.vocab.strings
    assert "Q2146908" in entity_linker.kb.vocab.strings

    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert entity_linker.model.get_dim("nO") == vector_length
    assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)

    # test the trained model
    predictions = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        for ent in doc.ents:
            predictions.append(ent.kb_id_)
    assert predictions == GOLD_entities

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert nlp2.pipe_names == nlp.pipe_names
        assert "Q2146908" in nlp2.vocab.strings
        entity_linker2 = nlp2.get_pipe("entity_linker")
        assert "Q2146908" in entity_linker2.vocab.strings
        assert "Q2146908" in entity_linker2.kb.vocab.strings
        predictions = []
        for text, annotation in TRAIN_DATA:
            doc2 = nlp2(text)
            for ent in doc2.ents:
                predictions.append(ent.kb_id_)
        assert predictions == GOLD_entities

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Russ Cochran captured his first major title with his son as caddie.",
        "Russ Cochran his reprints include EC Comics.",
        "Russ Cochran has been publishing comic art.",
        "Russ Cochran was a member of University of Kentucky's golf team.",
    ]
    batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
def test_beam_overfitting_IO():
    # Simple test to try and quickly overfit the Beam dependency parser
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
    }
    parser = nlp.add_pipe("beam_parser", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()

    # run overfitting
    for i in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_parser"] < 0.0001

    # test the scores from the beam
    test_text = "I like securities."
    docs = [nlp.make_doc(test_text)]
    beams = parser.predict(docs)
    head_scores, label_scores = parser.scored_parses(beams)
    # we only processed one document
    head_scores = head_scores[0]
    label_scores = label_scores[0]
    # test label annotations: 0=nsubj, 2=dobj, 3=punct
    assert label_scores[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
    assert label_scores[(0, "dobj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(0, "punct")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(2, "dobj")] == pytest.approx(1.0, abs=eps)
    assert label_scores[(2, "punct")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(3, "dobj")] == pytest.approx(0.0, abs=eps)
    assert label_scores[(3, "punct")] == pytest.approx(1.0, abs=eps)
    # test head annotations: the root is token at index 1
    assert head_scores[(0, 0)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(0, 1)] == pytest.approx(1.0, abs=eps)
    assert head_scores[(0, 2)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(2, 0)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(2, 1)] == pytest.approx(1.0, abs=eps)
    assert head_scores[(2, 2)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(3, 0)] == pytest.approx(0.0, abs=eps)
    assert head_scores[(3, 1)] == pytest.approx(1.0, abs=eps)
    assert head_scores[(3, 2)] == pytest.approx(0.0, abs=eps)

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        parser2 = nlp2.get_pipe("beam_parser")
        beams2 = parser2.predict(docs2)
        head_scores2, label_scores2 = parser2.scored_parses(beams2)
        # we only processed one document
        head_scores2 = head_scores2[0]
        label_scores2 = label_scores2[0]
        # check the results again
        assert label_scores2[(0, "nsubj")] == pytest.approx(1.0, abs=eps)
        assert label_scores2[(0, "dobj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(0, "punct")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(2, "nsubj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(2, "dobj")] == pytest.approx(1.0, abs=eps)
        assert label_scores2[(2, "punct")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(3, "nsubj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(3, "dobj")] == pytest.approx(0.0, abs=eps)
        assert label_scores2[(3, "punct")] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(0, 0)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(0, 1)] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(0, 2)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(2, 0)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(2, 1)] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(2, 2)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(3, 0)] == pytest.approx(0.0, abs=eps)
        assert head_scores2[(3, 1)] == pytest.approx(1.0, abs=eps)
        assert head_scores2[(3, 2)] == pytest.approx(0.0, abs=eps)
def custom_train( lang, output_path, train_path, dev_path, raw_text=None, base_model=None, pipeline="tagger,parser,ner", vectors=None, n_iter=30, n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", meta_path=None, init_tok2vec=None, parser_multitasks="", entity_multitasks="", noise_level=0.0, orth_variant_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, textcat_multilabel=False, textcat_arch="bow", textcat_positive_label=None, verbose=False, debug=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm msg = Printer() util.fix_random_seed() util.set_env_log(verbose) # Make sure all files and paths exists if they are needed train_path = util.ensure_path(train_path) dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) output_path = util.ensure_path(output_path) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if meta_path else {} if output_path.exists() and [ p for p in output_path.iterdir() if p.is_dir() ]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_path.exists(): output_path.mkdir() # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( util.env_opt("dropout_from", 0.2), util.env_opt("dropout_to", 0.2), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( util.env_opt("batch_from", 100.0), util.env_opt("batch_to", 1000.0), util.env_opt("batch_compound", 1.001), ) if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. 
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                elif pipe == "textcat":
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does "
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(base_cfg, pipe_cfg),
                            exits=1,
                        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)

        ### Here are our modifications:
        lang_cls.Defaults.tag_map = custom_tag_map
        nlp = lang_cls()
        assert nlp.vocab.morphology.n_tags == 36
        ###

        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )

    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                )
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
            # Early stopping
            if n_early_stopping is not None:
                current_score = _score_for_model(meta)
                if current_score < best_score:
                    iter_since_best += 1
                else:
                    iter_since_best = 0
                    best_score = current_score
                if iter_since_best >= n_early_stopping:
                    msg.text(
                        "Early stopping, best iteration "
                        "is: {}".format(i - iter_since_best)
                    )
                    msg.text(
                        "Best score = {}; Final iteration "
                        "score = {}".format(best_score, current_score)
                    )
                    break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)