def test_resize_same_results(name, textcat_config):
    """Check that adding a label keeps the scores of the existing labels.

    A text classifier is trained on the single-label data, a third label is
    added, and the scores of the two original labels must stay identical
    until the pipeline is trained again.
    """
    fix_random_seed(0)
    nlp = English()
    textcat = nlp.add_pipe(name, config={"model": textcat_config})

    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA_SINGLE_LABEL
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert textcat.model.maybe_get_dim("nO") in [2, None]

    for _ in range(5):
        nlp.update(train_examples, sgd=optimizer, losses={})

    # Baseline scores from the trained two-label model.
    test_text = "I am happy."
    baseline = nlp(test_text).cats
    assert len(baseline) == 2
    pos_pred = baseline["POSITIVE"]
    neg_pred = baseline["NEGATIVE"]

    # Right after resizing, the old labels must score exactly as before.
    textcat.add_label("NEUTRAL")
    resized = nlp(test_text).cats
    assert len(resized) == 3
    assert resized["POSITIVE"] == pos_pred
    assert resized["NEGATIVE"] == neg_pred
    assert resized["NEUTRAL"] <= 1

    for _ in range(5):
        nlp.update(train_examples, sgd=optimizer, losses={})

    # After further training with the new label the scores must have moved.
    retrained = nlp(test_text).cats
    assert len(retrained) == 3
    assert retrained["POSITIVE"] != pos_pred
    assert retrained["NEGATIVE"] != neg_pred
    for score in retrained.values():
        assert score <= 1
def test_get_span_characteristics_return_value():
    """Check the keys and length bounds returned by _get_span_characteristics."""
    nlp = English()
    spans_key = "sc"
    words = ["Welcome", "to", "the", "Bank", "of", "China", "."]

    # Predicted and reference docs carry the same two spans.
    pred = Doc(nlp.vocab, words=words)
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=words)
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]

    examples = [Example(pred, ref)]
    compiled = _compile_gold(examples, ["spancat"], nlp, True)
    characteristics = _get_span_characteristics(
        examples=examples, compiled_gold=compiled, spans_key=spans_key
    )

    assert {"sd", "bd", "lengths"}.issubset(characteristics.keys())
    assert characteristics["min_length"] == 1
    assert characteristics["max_length"] == 3
def test_textcat_multi_threshold():
    """Check that the scorer accepts a custom ``threshold`` via ``scorer_cfg``."""
    nlp = English()
    nlp.add_pipe("textcat_multilabel")

    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA_SINGLE_LABEL
    ]
    nlp.initialize(get_examples=lambda: train_examples)

    # The model is only initialized, not trained; that is enough for scoring.
    default_scores = nlp.evaluate(train_examples)
    assert 0 <= default_scores["cats_score"] <= 1

    strict_scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
    assert strict_scores["cats_f_per_type"]["POSITIVE"]["r"] == 0

    lenient_scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
    assert lenient_scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
def test_split_sents(merged_dict):
    """Check that ``Example.split_sents`` yields one example per sentence."""
    nlp = English()
    doc = Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"])
    example = Example.from_dict(doc, merged_dict)
    assert example.text == "Hi there everyone It is just me"

    first, second = split_examples = example.split_sents()
    assert len(split_examples) == 2
    assert first.text == "Hi there everyone "
    assert second.text == "It is just me"

    # Token-level annotation must be divided along the same boundary.
    first_annot = first.to_dict()["token_annotation"]
    assert first_annot["ORTH"] == ["Hi", "there", "everyone"]
    assert first_annot["TAG"] == ["INTJ", "ADV", "PRON"]
    assert first_annot["SENT_START"] == [1, 0, 0]

    second_annot = second.to_dict()["token_annotation"]
    assert second_annot["ORTH"] == ["It", "is", "just", "me"]
    assert second_annot["TAG"] == ["PRON", "AUX", "ADV", "PRON"]
    assert second_annot["SENT_START"] == [1, 0, 0, 0]
def test_tok2vec_listener_callback():
    """Check that the tagger's backprop callback works after a tok2vec update."""
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tok2vec", "tagger"]

    tok2vec = nlp.get_pipe("tok2vec")
    tagger = nlp.get_pipe("tagger")
    nlp._link_components()

    # Initialize both models from a single sample doc.
    init_docs = [nlp.make_doc("A random sentence")]
    tok2vec.model.initialize(X=init_docs)
    gold_array = [[1.0, 1.0] for _ in init_docs]  # one column per tag ("V", "Z")
    label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
    tagger.model.initialize(X=init_docs, Y=label_sample)

    update_docs = [nlp.make_doc("Another entirely random sentence")]
    tok2vec.update([Example.from_dict(doc, {}) for doc in update_docs])
    Y, get_dX = tagger.model.begin_update(update_docs)
    # The backprop call must work and must not hit a 'None' callback.
    assert get_dX(Y) is not None
def test_train_empty():
    """Ensure NER updates run without raising when one text is empty."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]
    nlp = English()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
    ]
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for _ in range(2):
        losses = {}
        for batch in util.minibatch(train_examples, size=8):
            nlp.update(batch, losses=losses)
def test_train_negative_deprecated():
    """Ensure a "!"-prefixed entity label makes ``nlp.update`` raise ValueError."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
    ]
    nlp = English()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
    ]
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for _ in range(2):
        losses = {}
        for batch in util.minibatch(train_examples, size=8):
            with pytest.raises(ValueError):
                nlp.update(batch, losses=losses)
def test_issue2800():
    """Train an NER model that has 1000 labels.

    Regression test: adding this many labels used to cause a segfault.
    """
    nlp = English()
    train_data = [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
    ner = nlp.add_pipe("ner")
    for label_id in range(1000):
        ner.add_label(str(label_id))
    optimizer = nlp.initialize()
    for _ in range(20):
        losses = {}
        random.shuffle(train_data)
        for example in train_data:
            nlp.update([example], sgd=optimizer, losses=losses, drop=0.5)
def test_issue4030():
    """Check that textcat scores every category 0.0 for an empty doc."""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")

    # Build one example per text with a boolean flag for every class.
    train_data = [
        Example.from_dict(
            nlp.make_doc(text),
            {"cats": {label: label == gold_label for label in unique_classes}},
        )
        for text, gold_label in zip(x_train, y_train)
    ]

    # Add a text categorizer component.
    model = {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 2,
        "no_output_layer": False,
    }
    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    for label in unique_classes:
        textcat.add_label(label)

    # Train with only the textcat pipe enabled.
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.initialize()
        for _ in range(3):
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in util.minibatch(train_data, size=batch_sizes):
                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)

    # An empty doc should score 0.0 for all categories.
    empty_doc = nlp("")
    assert empty_doc.cats["offensive"] == 0.0
    assert empty_doc.cats["inoffensive"] == 0.0
def test_initialize_examples():
    """Check which ``get_examples`` values ``nlp.initialize`` accepts or rejects."""
    nlp = Language()
    tagger = nlp.add_pipe("tagger")
    for tag in TAGS:
        tagger.add_label(tag)
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
    ]

    # Initializing more than once is unusual, but fine for the purpose of testing.
    nlp.initialize()
    nlp.initialize(get_examples=lambda: train_examples)

    # A callback returning None, a single Example or an empty list is rejected.
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: None)
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: train_examples[0])
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=lambda: [])
    # Passing the list itself instead of a callable is rejected as well.
    with pytest.raises(TypeError):
        nlp.initialize(get_examples=train_examples)
def test_incomplete_data():
    """Check that the trainable lemmatizer learns from partially annotated data."""
    nlp = English()
    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
    lemmatizer.min_tree_freq = 1
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in PARTIAL_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for _ in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # Only the loss of the final update is checked.
    assert losses["trainable_lemmatizer"] < 0.00001

    # Test the trained model.
    doc = nlp("She likes blue eggs")
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"
def test_tokenization(sented_doc):
    """Check the token-level scores reported by ``Scorer.score``.

    The example is first scored with ``sented_doc`` used both as the
    prediction and as the source of the gold sentence starts, which must give
    a token accuracy of 1.0. The prediction is then replaced by a six-token
    doc and the resulting accuracy, precision, recall and F-score are checked.
    """
    scorer = Scorer()
    gold = {"sent_starts": [t.sent_start for t in sented_doc]}
    example = Example.from_dict(sented_doc, gold)
    scores = scorer.score([example])
    assert scores["token_acc"] == approx(1.0)

    nlp = English()
    example.predicted = Doc(
        nlp.vocab,
        words=["One", "sentence.", "Two", "sentences.", "Three", "sentences."],
        spaces=[True, True, True, True, True, False],
    )
    example.predicted[1].is_sent_start = False
    scores = scorer.score([example])
    # The scores are computed floats, so every one of them is compared with a
    # tolerance instead of exact equality.
    assert scores["token_acc"] == approx(0.66666666)
    assert scores["token_p"] == approx(0.5)
    assert scores["token_r"] == approx(0.33333333)
    assert scores["token_f"] == approx(0.4)
def test_annotates_on_update():
    """Check that ``annotates`` makes earlier pipes set annotations during update.

    A custom component raises ValueError whenever a doc has no SENT_START
    annotation, both when called and when updated.
    """

    class AssertSents:
        def __init__(self, name, **cfg):
            self.name = name

        def __call__(self, doc):
            if not doc.has_annotation("SENT_START"):
                raise ValueError("No sents")
            return doc

        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
            if any(
                not eg.predicted.has_annotation("SENT_START") for eg in examples
            ):
                raise ValueError("No sents")
            return {}

    @Language.factory("assert_sents", default_config={})
    def assert_sents(nlp, name):
        return AssertSents(name)

    nlp = English()
    nlp.add_pipe("sentencizer")
    nlp.add_pipe("assert_sents")

    # When the full pipeline runs, the annotations are set.
    nlp("This is a sentence.")

    examples = [
        Example(nlp.make_doc(text), nlp(text)) for text in ["a a", "b b", "c c"]
    ]
    for eg in examples:
        assert not eg.predicted.has_annotation("SENT_START")

    # Updating without setting annotations makes assert_sents raise.
    with pytest.raises(ValueError):
        nlp.update(examples)

    # Updating while the sentencizer annotates succeeds.
    nlp.update(examples, annotates=["sentencizer"])
def test_beam_valid_parse(neg_key):
    """Regression test: update beam NER on an example that has a negative span.

    The behaviour was previously flaky; the test only requires that the
    updates run and that a "beam_ner" loss is reported.
    """
    nlp = English()
    config = {
        "beam_width": 16,
        "beam_density": 0.0001,
        "incorrect_spans_key": neg_key,
    }
    nlp.add_pipe("beam_ner", config=config)
    # fmt: off
    tokens = [
        'FEDERAL', 'NATIONAL', 'MORTGAGE', 'ASSOCIATION', '(', 'Fannie', 'Mae', '):',
        'Posted', 'yields', 'on', '30', 'year', 'mortgage', 'commitments', 'for',
        'delivery', 'within', '30', 'days', '(', 'priced', 'at', 'par', ')',
        '9.75', '%', ',', 'standard', 'conventional', 'fixed', '-', 'rate', 'mortgages', ';',
        '8.70', '%', ',', '6/2', 'rate', 'capped', 'one', '-', 'year', 'adjustable', 'rate', 'mortgages', '.',
        'Source', ':', 'Telerate', 'Systems', 'Inc.',
    ]
    iob = [
        'B-ORG', 'I-ORG', 'I-ORG', 'L-ORG', 'O', 'B-ORG', 'L-ORG', 'O',
        'O', 'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O',
        'O', 'O', 'B-DATE', 'L-DATE', 'O', 'O', 'O', 'O', 'O',
        'B-PERCENT', 'L-PERCENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'B-PERCENT', 'L-PERCENT', 'O', 'U-CARDINAL', 'O', 'O', 'B-DATE', 'I-DATE', 'L-DATE', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O',
    ]
    # fmt: on

    doc = Doc(nlp.vocab, words=tokens)
    example = Example.from_dict(doc, {"ner": iob})
    # Register the last three tokens as an ORG span under the negative key.
    example.reference.spans[neg_key] = [Span(doc, 50, 53, "ORG")]

    optimizer = nlp.initialize()
    for _ in range(5):
        losses = {}
        nlp.update([example], sgd=optimizer, losses=losses)
    assert "beam_ner" in losses
def test_beam_overfitting_IO():
    """Overfit the beam NER component and check its scores survive a disk round trip."""
    nlp = English()
    ner = nlp.add_pipe("beam_ner", config={"beam_width": 16, "beam_density": 0.0001})

    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for _start, _end, label in annotations.get("entities"):
            ner.add_label(label)
    optimizer = nlp.initialize()

    # Run the overfitting loop; only the final loss is checked.
    for _ in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_ner"] < 0.0001

    # Test the entity scores coming out of the beam.
    test_text = "I like London."
    beams = ner.predict([nlp.make_doc(test_text)])
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
    assert entity_scores[(2, 3, "PERSON")] == 0.0

    # The results must be the same after saving and loading.
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict([nlp2.make_doc(test_text)])
        entity_scores2 = ner2.scored_ents(beams2)[0]
        assert entity_scores2[(2, 3, "LOC")] == 1.0
        assert entity_scores2[(2, 3, "PERSON")] == 0.0
def test_update_with_annotates():
    """Check that ``nlp.update`` runs exactly the components named in ``annotates``.

    Two recording components append the text of every doc they see to a
    shared dict, which is inspected after each update.
    """
    name = "test_with_annotates"
    first, second = f"{name}1", f"{name}2"
    results = {}

    def make_component(name):
        results[name] = ""

        def component(doc):
            # Record every doc text this component gets to process.
            results[name] += doc.text
            return doc

        return component

    Language.component(first, func=make_component(first))
    Language.component(second, func=make_component(second))
    components = {first, second}

    nlp = English()
    examples = [
        Example(nlp.make_doc(text), nlp.make_doc(text)) for text in ["a", "bb", "ccc"]
    ]

    annotate_cases = [[], [first], [first, second], [second, first]]
    for components_to_annotate in annotate_cases:
        # Reset the recordings and rebuild the pipeline on the same vocab.
        for key in results:
            results[key] = ""
        nlp = English(vocab=nlp.vocab)
        nlp.add_pipe(first)
        nlp.add_pipe(second)
        nlp.update(examples, annotates=components_to_annotate)

        for annotated in components_to_annotate:
            assert results[annotated] == "".join(eg.predicted.text for eg in examples)
        for skipped in components - set(components_to_annotate):
            assert results[skipped] == ""
async def train(self, sources: Sources):
    """Train the NER pipe on examples derived from ``sources``.

    The examples come from ``self._preprocess_data(sources)`` and are
    unpacked as ``(text, gold_dict)`` pairs. Every entity label found in them
    is registered on ``self.ner``, all pipes outside the NER-related
    exceptions are disabled for the duration of training, and the pipeline is
    updated for ``config.n_iter`` iterations. If ``config.directory`` is set,
    the pipeline is saved there afterwards.
    """
    train_examples = await self._preprocess_data(sources)
    for _, entities in train_examples:
        for ent in entities.get("entities"):
            self.ner.add_label(ent[2])

    # Get the names of the other pipes so they can be disabled during training.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions
    ]

    # Only train NER. select_pipes() replaces the deprecated disable_pipes().
    with self.nlp.select_pipes(disable=other_pipes), warnings.catch_warnings():
        # Show warnings for misaligned entity spans once.
        warnings.filterwarnings("once", category=UserWarning, module="spacy")
        if self.parent.config.model_name_or_path is None:
            # initialize() replaces the deprecated begin_training().
            self.nlp.initialize()
        for _ in range(self.parent.config.n_iter):
            random.shuffle(train_examples)
            losses = {}
            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = [
                    Example.from_dict(self.nlp.make_doc(text), gold_dict)
                    for text, gold_dict in batch
                ]
                self.nlp.update(
                    examples,
                    drop=self.parent.config.dropout,
                    losses=losses,
                )
            self.logger.debug(f"Losses: {losses}")

    directory = self.parent.config.directory
    if directory is not None:
        # exist_ok avoids the race between checking for and creating the directory.
        directory.mkdir(parents=True, exist_ok=True)
        self.nlp.to_disk(directory)
        self.logger.debug(f"Saved model to {directory.name}")
def parser(vocab):
    """Build a DependencyParser with a "left" label and run ten updates on one example."""
    vocab.strings.add("ROOT")
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    dep_parser = DependencyParser(vocab, model)
    dep_parser.cfg["token_vector_width"] = 4
    dep_parser.cfg["hidden_width"] = 32
    dep_parser.add_label("left")
    dep_parser.initialize(lambda: [_parser_example(dep_parser)])

    sgd = Adam(0.001)
    gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
    for _ in range(10):
        # A fresh doc, example and losses dict are built for every update.
        doc = Doc(vocab, words=["a", "b", "c", "d"])
        example = Example.from_dict(
            doc, {"heads": list(gold["heads"]), "deps": list(gold["deps"])}
        )
        dep_parser.update([example], sgd=sgd, losses={})
    return dep_parser
def test_negative_samples_two_word_input(tsys, vocab, neg_key):
    """Check the oracle sequence for a two-word doc that has negative spans.

    Without the right check on the B action the oracle could get stuck on
    such an input.
    """
    tsys.cfg["neg_key"] = neg_key
    doc = Doc(vocab, words=["A", "B"])
    example = Example.from_dict(doc, {"entities": [None, None]})
    # The negative spans rule out O for the first word and rule out the
    # analysis B-PERSON, L-PERSON over both words.
    negative_spans = [
        Span(example.y, 0, 1, label="O"),
        Span(example.y, 0, 2, label="PERSON"),
    ]
    example.y.spans[neg_key] = negative_spans

    names = [tsys.get_class_name(act) for act in tsys.get_oracle_sequence(example)]
    assert names
    first, second = names[0], names[1]
    assert first != "O"
    assert first != "B-PERSON"
    assert second != "L-PERSON"
def test_replace_listeners_from_config():
    """Check that ``replace_listeners`` in a config replaces the tagger's listener.

    A pipeline in which tagger and ner both listen to tok2vec is saved, then
    sourced from disk with ``replace_listeners`` set for the tagger only.
    """

    def has_listener(pipe):
        return any(isinstance(node, Tok2VecListener) for node in pipe.model.walk())

    orig_config = Config().from_str(cfg_string_multi)
    nlp = util.load_model_from_config(orig_config, auto_fill=True)
    annots = {"tags": ["V", "Z"], "entities": [(0, 1, "A"), (1, 2, "B")]}
    examples = [Example.from_dict(nlp.make_doc("x y"), annots)]
    nlp.initialize(lambda: examples)

    # In the original pipeline both components listen to tok2vec.
    assert nlp.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
    assert has_listener(nlp.get_pipe("ner"))
    assert has_listener(nlp.get_pipe("tagger"))

    with make_tempdir() as dir_path:
        nlp.to_disk(dir_path)
        base_model = str(dir_path)
        new_config = {
            "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
            "components": {
                "tok2vec": {"source": base_model},
                "tagger": {
                    "source": base_model,
                    "replace_listeners": ["model.tok2vec"],
                },
                "ner": {"source": base_model},
            },
        }
        new_nlp = util.load_model_from_config(new_config, auto_fill=True)
    new_nlp.initialize(lambda: examples)

    # Only ner still listens; the tagger no longer contains a listener node.
    assert new_nlp.get_pipe("tok2vec").listening_components == ["ner"]
    assert has_listener(new_nlp.get_pipe("ner"))
    assert not has_listener(new_nlp.get_pipe("tagger"))

    component_cfg = new_nlp.config["components"]
    t2v_cfg = component_cfg["tok2vec"]["model"]
    assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
    assert component_cfg["tagger"]["model"]["tok2vec"] == t2v_cfg
    assert (
        component_cfg["ner"]["model"]["tok2vec"]["@architectures"]
        == "spacy.Tok2VecListener.v1"
    )
def test_oracle_moves_whitespace(en_vocab):
    """Run the oracle on a doc whose entity span contains newline tokens."""
    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
    doc = Doc(en_vocab, words=words)
    example = Example.from_dict(doc, {"entities": biluo_tags})

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    for tag in biluo_tags:
        if tag is None:
            continue
        # "O" has no label; every other tag is "<action>-<label>".
        action, label = ("O", "") if tag == "O" else tag.split("-")
        moves.add_action(move_types.index(action), label)
    moves.get_oracle_sequence(example)
def test_oracle_moves_missing_B(en_vocab):
    """Run the oracle on an example whose only annotated tag is an L- tag."""
    words = ["B", "52", "Bomber"]
    biluo_tags = [None, None, "L-PRODUCT"]
    doc = Doc(en_vocab, words=words)
    example = Example.from_dict(doc, {"words": words, "entities": biluo_tags})

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    for tag in biluo_tags:
        if tag is None:
            continue
        if tag == "O":
            moves.add_action(move_types.index("O"), "")
            continue
        # Register all four entity actions for the label, whatever the tag's action.
        _, label = tag.split("-")
        for action in ("B", "I", "L", "U"):
            moves.add_action(move_types.index(action), label)
    moves.get_oracle_sequence(example)
def test_model_config_inline(model):
    """Train the "sklearn-cat" pipe with an inline model config and check it sets cats."""
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(
        "sklearn-cat",
        config={"sklearn_model": model, "label": "pos", "classes": ["pos", "neg"]},
    )

    texts = ["you are a nice person", "this is a great movie", "i do not like coffee"]
    labels = ["pos", "pos", "neg"]

    with nlp.select_pipes(enable="sklearn-cat"):
        optimizer = nlp.resume_training()
        for _ in range(100):
            for text, label in zip(texts, labels):
                example = Example.from_dict(nlp.make_doc(text), {"cats": {"pos": label}})
                nlp.update([example], sgd=optimizer)

    # Both a seen text and an unseen one must receive at least one category.
    for text in ("you are a nice person", "coffee i do not like"):
        assert len(nlp(text).cats.keys()) > 0
def test_debug_data_compile_gold_for_spans():
    """Check the span statistics that ``_compile_gold`` collects for spancat."""
    nlp = English()
    spans_key = "sc"
    words = ["Welcome", "to", "the", "Bank", "of", "China", "."]

    # Predicted and reference docs carry the same two spans.
    pred = Doc(nlp.vocab, words=words)
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=words)
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]

    data = _compile_gold([Example(pred, ref)], ["spancat"], nlp, True)

    assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
    assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}
    expected_spans = {
        "ORG": [Span(ref, 3, 6, "ORG")],
        "GPE": [Span(ref, 5, 6, "GPE")],
    }
    assert data["spans_per_type"][spans_key] == expected_spans
    # Boundary tokens: the token just before and the token just after each span.
    expected_boundaries = {
        "ORG": {"start": [ref[2:3]], "end": [ref[6:7]]},
        "GPE": {"start": [ref[4:5]], "end": [ref[6:7]]},
    }
    assert data["sb_per_type"][spans_key] == expected_boundaries
def spacy_model_with_data():
    """Train a two-category spaCy textcat model and return it with its test data.

    The pipeline is set up through the spaCy 3 API when
    IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0 is true and through the
    older ``create_pipe`` API otherwise.
    """
    # Create a blank model and set up the spaCy pipeline.
    nlp = spacy.blank("en")
    if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
        from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

        model = {
            "@architectures": "spacy.TextCatCNN.v1",
            "exclusive_classes": True,
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
        }
        textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    else:
        legacy_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
        textcat = nlp.create_pipe("textcat", config=legacy_config)
        nlp.add_pipe(textcat, last=True)

    # The model learns to tell computer graphics from baseball (20newsgroups).
    categories = ["comp.graphics", "rec.sport.baseball"]
    for category in categories:
        textcat.add_label(category)

    # Split train/test and train the model.
    train_x, train_y, test_x, _ = _get_train_test_dataset(categories)
    train_data = [(text, {"cats": cats}) for text, cats in zip(train_x, train_y)]

    if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
        from spacy.training import Example

        train_data = [
            Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
        ]

    _train_model(nlp, train_data)
    return ModelWithData(nlp, pd.DataFrame(test_x))
def test_attributeruler_score(nlp, pattern_dicts):
    """Check attribute ruler scoring with the default and with a custom scorer."""
    text = "This is a test."

    # Initialize with patterns and check the attributes the ruler sets.
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.initialize(lambda: [], patterns=pattern_dicts)
    doc = nlp(text)
    assert doc[2].lemma_ == "the"
    assert str(doc[2].morph) == "Case=Nom|Number=Plur"
    assert doc[3].lemma_ == "cat"
    assert str(doc[3].morph) == "Case=Nom|Number=Sing"

    gold = {"lemmas": ["this", "is", "a", "cat", "."]}
    dev_examples = [Example.from_dict(nlp.make_doc(text), gold)]
    scores = nlp.evaluate(dev_examples)
    # "cat" is the only correct lemma.
    assert scores["lemma_acc"] == pytest.approx(0.2)
    # No morphs are set in the gold data.
    assert scores["morph_acc"] is None
    nlp.remove_pipe("attribute_ruler")

    # Test with a custom scorer that echoes the configured value.
    @registry.misc("weird_scorer.v1")
    def make_weird_scorer():
        def weird_scorer(examples, weird_score, **kwargs):
            return {"weird_score": weird_score}

        return weird_scorer

    scorer_config = {"scorer": {"@misc": "weird_scorer.v1"}}
    ruler = nlp.add_pipe("attribute_ruler", config=scorer_config)
    ruler.initialize(lambda: [], patterns=pattern_dicts)

    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
    assert scores["weird_score"] == 0.12345
    assert "token_acc" in scores
    assert "lemma_acc" not in scores

    scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
    assert scores["weird_score"] == 0.23456
def test_incomplete_data(pipe_name):
    """Check that the parser pipe learns from partially annotated data."""
    nlp = English()
    parser = nlp.add_pipe(pipe_name)

    train_examples = []
    for text, annotations in PARTIAL_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        # Missing dependency labels are given as None and are skipped.
        for dep in annotations.get("deps", []):
            if dep is None:
                continue
            parser.add_label(dep)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for _ in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # Only the loss of the final update is checked.
    assert losses[pipe_name] < 0.0001

    # Test the trained model.
    doc = nlp("I like securities.")
    assert doc[0].dep_ == "nsubj"
    assert doc[2].dep_ == "dobj"
    assert doc[0].head.i == 1
    assert doc[2].head.i == 1
def test_issue7029():
    """Check that an empty text does not disturb the rest of its batch.

    Tags predicted with ``batch_size=1`` and ``batch_size=4`` must agree for
    every non-empty text.
    """
    train_data = [
        ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
        ("Eat blue ham", {"tags": ["V", "J", "N"]}),
    ]
    nlp = English.from_config(load_config_from_str(CONFIG_7029))
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annots) for text, annots in train_data
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(50):
        nlp.update(train_examples, sgd=optimizer, losses={})

    # The final text is empty, so it is left out of the comparison.
    texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
    docs_single = list(nlp.pipe(texts, batch_size=1))
    docs_batched = list(nlp.pipe(texts, batch_size=4))
    tags_single = [doc[0].tag_ for doc in docs_single[:-1]]
    tags_batched = [doc[0].tag_ for doc in docs_batched[:-1]]
    assert tags_single == tags_batched
def test_aligned_spans_y2x(en_vocab, en_tokenizer):
    """Check that reference entities map onto coarser predicted tokens.

    The predicted doc keeps each multi-word name as a single token, while the
    reference tokenization splits them into separate words.
    """
    words = ["Mr and Mrs Smith", "flew", "to", "San Francisco Valley", "."]
    spaces = [True, True, True, False, False]
    doc = Doc(en_vocab, words=words, spaces=spaces)

    # Character offsets of the two entities in the text.
    person_text = "Mr and Mrs Smith"
    loc_text = "San Francisco Valley"
    prefix = "Mr and Mrs Smith flew to "
    entities = [
        (0, len(person_text), "PERSON"),
        (len(prefix), len(prefix + loc_text), "LOC"),
    ]
    # fmt: off
    tokens_ref = ["Mr", "and", "Mrs", "Smith", "flew", "to", "San", "Francisco", "Valley", "."]
    # fmt: on
    example = Example.from_dict(doc, {"words": tokens_ref, "entities": entities})

    ents_ref = example.reference.ents
    assert [(ent.start, ent.end) for ent in ents_ref] == [(0, 4), (6, 9)]

    ents_y2x = example.get_aligned_spans_y2x(ents_ref)
    assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1), (3, 4)]
def spacy_model() -> spacy.language.Language:
    """Build a blank English pipeline and train its NER pipe on ``train_data``.

    Every entity label found in the training annotations is registered on
    the NER pipe, then the pipeline is updated for ten shuffled passes over
    the examples in batches of eight.

    Returns:
        The trained pipeline.
    """
    model = spacy.blank("en")
    if "ner" not in model.pipe_names:
        ner = model.add_pipe("ner", last=True)
    else:
        ner = model.get_pipe("ner")

    examples: t.List[t.Any] = []
    for text, annotations in train_data:
        examples.append(Example.from_dict(model.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in model.pipe_names if pipe != "ner"]

    # select_pipes() and initialize() replace the deprecated disable_pipes()
    # and begin_training().
    with model.select_pipes(disable=other_pipes):
        optimizer = model.initialize()
        for _ in range(10):
            random.shuffle(examples)
            for batch in minibatch(examples, size=8):
                model.update(batch, sgd=optimizer)

    return model