def test_kb_serialization():
    """Check that a KB written to disk can be used in a pipeline with a different vocab."""
    entity_id = "Q2146908"
    extra_word = "RandomWord"
    vector_length = 3
    with make_tempdir() as tmp_dir:
        kb_dir = tmp_dir / "kb"

        # Build a one-entity KB on the first pipeline's vocab and save it.
        source_nlp = English()
        assert entity_id not in source_nlp.vocab.strings
        source_kb = KnowledgeBase(source_nlp.vocab, entity_vector_length=vector_length)
        source_kb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
        source_kb.add_alias(
            alias="Russ Cochran", entities=[entity_id], probabilities=[0.8]
        )
        assert entity_id in source_nlp.vocab.strings
        source_kb.to_disk(kb_dir)

        # The second pipeline gets a string of its own and does not know the entity yet.
        target_nlp = English()
        assert extra_word not in target_nlp.vocab.strings
        target_nlp.vocab.strings.add(extra_word)
        assert extra_word in target_nlp.vocab.strings
        assert entity_id not in target_nlp.vocab.strings

        # Create the Entity Linker component with the KB from file, and check the final vocab
        linker = target_nlp.add_pipe("entity_linker", last=True)
        linker.set_kb(load_kb(kb_dir))
        assert entity_id in target_nlp.vocab.strings
        assert extra_word in target_nlp.vocab.strings
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    # The same identifier is passed twice; the KB is expected to hold one entity.
    vectors = [[0.9, 1.1, 1.01], [1.8, 2.25, 2.01]]
    kb.set_entities(
        entity_list=["Q1", "Q1"],
        freq_list=[32, 111],
        vector_list=vectors,
    )
    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        out_dir = ensure_path(d)
        if not out_dir.exists():
            out_dir.mkdir()
        kb_file = str(out_dir / "kb")
        kb.dump(kb_file)
        reloaded_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        reloaded_kb.load_bulk(kb_file)

    assert reloaded_kb.get_size_entities() == 1
def test_issue5137():
    """Check that a keyword passed to spacy.load reaches a custom component's factory."""
    component_name = "my_component"

    class MyComponent(object):
        # Minimal component: keeps the pipeline and a "categories" setting from cfg.
        name = "my_component"

        def __init__(self, nlp, **cfg):
            self.nlp = nlp
            self.categories = cfg.get("categories", "all_categories")

        def __call__(self, doc):
            return None

        def to_disk(self, path, **kwargs):
            return None

        def from_disk(self, path, **cfg):
            return None

    def build_component(nlp, **cfg):
        return MyComponent(nlp, **cfg)

    Language.factories[component_name] = build_component

    # Without an override the component falls back to its default setting.
    pipeline = English()
    pipeline.add_pipe(pipeline.create_pipe(component_name))
    assert pipeline.get_pipe(component_name).categories == "all_categories"

    # After a round trip, the keyword given to spacy.load must show up on the component.
    with make_tempdir() as tmpdir:
        pipeline.to_disk(tmpdir)
        reloaded = spacy.load(tmpdir, categories="my_categories")
        assert reloaded.get_pipe(component_name).categories == "my_categories"
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    kb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    kb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    adam_hash = kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    def assert_adam_candidate(knowledge_base):
        # "adam" must resolve to exactly one candidate: entity Q2 under alias "adam".
        found = knowledge_base.get_alias_candidates("adam")
        assert len(found) == 1
        assert found[0].entity == q2_hash
        assert found[0].entity_ == "Q2"
        assert found[0].alias == adam_hash
        assert found[0].alias_ == "adam"

    assert_adam_candidate(kb)

    with make_tempdir() as d:
        kb_path = d / "kb"
        kb.to_disk(kb_path)
        # Load into a KB backed by a brand-new Vocab and repeat the same checks.
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(kb_path)
        assert_adam_candidate(kb_new_vocab)
def test_entity_ruler_serialize_dir(nlp, patterns):
    """Round-trip an entity ruler through a directory; a missing directory raises ValueError."""
    entity_ruler = nlp.add_pipe("entity_ruler")
    entity_ruler.add_patterns(patterns)
    with make_tempdir() as d:
        saved_dir = d / "test_ruler"
        missing_dir = d / "non_existing_dir"

        entity_ruler.to_disk(saved_dir)
        # read from an existing directory
        entity_ruler.from_disk(saved_dir)
        # read from a bad directory
        with pytest.raises(ValueError):
            entity_ruler.from_disk(missing_dir)
def test_kb_serialize_2(nlp):
    """Check that an entity vector is unchanged after the KB goes to disk and back."""
    expected_vector = [5, 6, 7, 8]
    original_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    original_kb.set_entities(["E1"], [1], [expected_vector])
    assert original_kb.get_vector("E1") == expected_vector

    with make_tempdir() as d:
        kb_path = d / "kb"
        original_kb.to_disk(kb_path)
        restored_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        restored_kb.from_disk(kb_path)
        assert restored_kb.get_vector("E1") == expected_vector
def test_span_ruler_serialize_dir(patterns):
    """Round-trip a span ruler through a directory; a missing directory raises ValueError."""
    pipeline = spacy.blank("xx")
    span_ruler = pipeline.add_pipe("span_ruler")
    span_ruler.add_patterns(patterns)
    with make_tempdir() as d:
        saved_dir = d / "test_ruler"
        missing_dir = d / "non_existing_dir"

        span_ruler.to_disk(saved_dir)
        # read from an existing directory
        span_ruler.from_disk(saved_dir)
        # read from a bad directory
        with pytest.raises(ValueError):
            span_ruler.from_disk(missing_dir)
def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory):
    """Round-trip an entity ruler through a .jsonl path; a missing file raises ValueError."""
    entity_ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler")
    entity_ruler.add_patterns(patterns)
    with make_tempdir() as d:
        saved_file = d / "test_ruler.jsonl"
        missing_file = d / "non_existing.jsonl"

        entity_ruler.to_disk(saved_file)
        # read from an existing jsonl file
        entity_ruler.from_disk(saved_file)
        # read from a bad jsonl file
        with pytest.raises(ValueError):
            entity_ruler.from_disk(missing_file)
def test_issue4402():
    """Check that GoldCorpus.train_docs yields 4 docs for json_data with gold_preproc=True."""
    nlp = English()
    with make_tempdir() as tmpdir:
        # Write the JSON training data to a temporary file.
        json_path = tmpdir / "test4402.json"
        srsly.write_json(json_path, json_data)

        # The same file is passed for both GoldCorpus arguments.
        corpus = GoldCorpus(str(json_path), str(json_path))

        train_docs = list(corpus.train_docs(nlp, gold_preproc=True, max_length=0))
        # assert that the data got split into 4 sentences
        assert len(train_docs) == 4
def test_overfitting_IO():
    """Simple test to try and quickly overfit the senter - ensuring the ML models work correctly."""
    nlp = English()
    train_examples = [
        Example.from_dict(nlp.make_doc(item[0]), item[1]) for item in TRAIN_DATA
    ]
    # add some cases where SENT_START == -1
    for example_idx, token_idx in ((0, 10), (1, 1), (1, 11)):
        train_examples[example_idx].reference[token_idx].is_sent_start = False

    nlp.add_pipe("senter")
    optimizer = nlp.initialize()

    # Only the loss of the final update is checked.
    for _ in range(200):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["senter"] < 0.001

    # test the trained model
    test_text = TRAIN_DATA[0][0]
    gold_sent_starts = [int(position in (0, 5, 9)) for position in range(14)]
    predicted = nlp(test_text)
    assert [int(tok.is_sent_start) for tok in predicted] == gold_sent_starts

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        reloaded = util.load_model_from_path(tmp_dir)
        predicted_after_io = reloaded(test_text)
        assert [
            int(tok.is_sent_start) for tok in predicted_after_io
        ] == gold_sent_starts

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    first_batch = [d.to_array([SENT_START]) for d in nlp.pipe(texts)]
    second_batch = [d.to_array([SENT_START]) for d in nlp.pipe(texts)]
    one_by_one_docs = [nlp(text) for text in texts]
    one_by_one = [d.to_array([SENT_START]) for d in one_by_one_docs]
    assert_equal(first_batch, second_batch)
    assert_equal(first_batch, one_by_one)

    # test internal pipe labels vs. Language.pipe_labels with hidden labels
    assert nlp.get_pipe("senter").labels == ("I", "S")
    assert "senter" not in nlp.pipe_labels
def test_kb_serialize(nlp):
    """Test serialization of the KB"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        # normal read-write behaviour, including a target inside a "new" subdirectory
        for target in (d / "kb", d / "new" / "kb"):
            kb.to_disk(target)
            kb.from_disk(target)

        # allow overwriting an existing file
        kb.to_disk(d / "kb")

        # can not read from an unknown file
        with pytest.raises(ValueError):
            kb.from_disk(d / "unknown" / "kb")
def test_kb_serialize_vocab(nlp):
    """Test serialization of the KB and custom strings"""
    entity = "MyFunnyID"
    assert entity not in nlp.vocab.strings

    # Adding the entity to the KB must also register its string in the vocab.
    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    assert not source_kb.contains_entity(entity)
    source_kb.add_entity(entity, freq=342, entity_vector=[3])
    assert source_kb.contains_entity(entity)
    assert entity in source_kb.vocab.strings

    with make_tempdir() as d:
        # normal read-write behaviour
        kb_path = d / "kb"
        source_kb.to_disk(kb_path)
        # Loading into a KB with an empty Vocab must bring the string along.
        restored_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        restored_kb.from_disk(kb_path)
        assert entity in restored_kb.vocab.strings
def test_kb_set_entities(nlp):
    """Test that set_entities entirely overwrites the previous set of entities"""
    initial_vector = [5, 6, 7, 8]
    e1_vector = [1, 1, 1, 0]
    e2_vector = [2, 2, 2, 3]

    def assert_only_e1_e2(knowledge_base):
        # Exactly E1 and E2 are present, each with its own vector.
        assert set(knowledge_base.get_entity_strings()) == {"E1", "E2"}
        assert knowledge_base.get_vector("E1") == e1_vector
        assert knowledge_base.get_vector("E2") == e2_vector

    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E0"], [1], [initial_vector])
    assert kb1.get_entity_strings() == ["E0"]

    # A second call replaces E0 rather than adding to it.
    kb1.set_entities(["E1", "E2"], [1, 9], [e1_vector, e2_vector])
    assert_only_e1_e2(kb1)

    with make_tempdir() as d:
        kb_path = d / "kb"
        kb1.to_disk(kb_path)
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(kb_path)
        assert_only_e1_e2(kb2)
def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # An empty alias is rejected and must not end up in the KB.
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    # Two regular aliases for the same entity.
    for alias_text, prior in (("x", 0.2), ("y", 0.1)):
        kb.add_alias(alias=alias_text, entities=["1"], probabilities=[prior])

    # Write the KB out and read it back into the same object.
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)

    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}
def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        # Save the English vocab and load it into a fresh Vocab object.
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)

        # Build a blank "en" pipeline on the reloaded vocab and save it.
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)

        # The language must survive the second round trip.
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"
def test_issue4190():
    """Check that a customized tokenizer gives the same tokens after saving and reloading."""
    text = "Test c."

    # Load default language and tokenize once before any customization.
    # The result is not asserted on; the call is kept as in the original test.
    pipeline = English()
    default_doc = pipeline(text)
    default_tokens = [tok.text for tok in default_doc]  # noqa: F841

    # Modify tokenizer
    customize_tokenizer(pipeline)
    custom_doc = pipeline(text)
    custom_tokens = [tok.text for tok in custom_doc]

    # Save and Reload
    with make_tempdir() as model_dir:
        pipeline.to_disk(model_dir)
        reloaded = spacy.load(model_dir)

    # This should be the modified tokenizer
    reloaded_doc = reloaded(text)
    reloaded_tokens = [tok.text for tok in reloaded_doc]
    assert custom_tokens == reloaded_tokens
def test_overfitting_IO():
    """Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly.

    Three training rounds are run on the same examples: with the annotations
    as given, with POS emptied, and with morph unset and POS set only on "ham".
    After the first round the predictions are also checked after a disk round
    trip and for consistency between nlp.pipe and direct calls.
    """
    nlp = English()
    nlp.add_pipe("morphologizer")
    train_examples = [
        Example.from_dict(nlp.make_doc(inst[0]), inst[1]) for inst in TRAIN_DATA
    ]
    test_text = "I like blue ham"

    def overfit():
        # Initialize on the current examples, run 50 updates and require that
        # the loss of the final update is close to zero.
        optimizer = nlp.initialize(get_examples=lambda: train_examples)
        losses = {}
        for _ in range(50):
            losses = {}
            nlp.update(train_examples, sgd=optimizer, losses=losses)
        assert losses["morphologizer"] < 0.00001

    def reset_morphologizer():
        # Swap the trained component for a fresh one.
        nlp.remove_pipe("morphologizer")
        nlp.add_pipe("morphologizer")

    def check_predictions(pipeline, gold_morphs, gold_pos_tags):
        # Compare the per-token morph strings and POS tags for the test text.
        doc = pipeline(test_text)
        assert [str(t.morph) for t in doc] == gold_morphs
        assert [t.pos_ for t in doc] == gold_pos_tags

    overfit()

    # test the trained model
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
    check_predictions(nlp, gold_morphs, gold_pos_tags)

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        check_predictions(nlp2, gold_morphs, gold_pos_tags)

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([MORPH]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # Test without POS
    reset_morphologizer()
    for example in train_examples:
        for token in example.reference:
            token.pos_ = ""
    overfit()

    # Test the trained model
    check_predictions(nlp, ["Feat=N", "Feat=V", "", ""], ["", "", "", ""])

    # Test with unset morph and partial POS
    reset_morphologizer()
    for example in train_examples:
        for token in example.reference:
            token.pos_ = "NOUN" if token.text == "ham" else ""
            token.set_morph(None)
    overfit()

    # Test the trained model
    check_predictions(nlp, ["", "", "", ""], ["NOUN", "NOUN", "NOUN", "NOUN"])
def test_overfitting_IO():
    """Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly."""
    nlp = English()
    vector_length = 3
    golfer_id = "Q2146908"
    assert golfer_id not in nlp.vocab.strings

    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = [
        Example.from_dict(nlp(text), annotation) for text, annotation in TRAIN_DATA
    ]

    def create_kb(vocab):
        # create artificial KB - assign same prior weight to the two russ cochran's
        # Q2146908 (Russ Cochran): American golfer
        # Q7381115 (Russ Cochran): publisher
        artificial_kb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        artificial_kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        artificial_kb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        artificial_kb.add_alias(
            alias="Russ Cochran",
            entities=["Q2146908", "Q7381115"],
            probabilities=[0.5, 0.5],
        )
        return artificial_kb

    def predicted_kb_ids(pipeline):
        # Collect the KB id of every entity the pipeline finds in the training texts.
        kb_ids = []
        for text, _annotation in TRAIN_DATA:
            processed = pipeline(text)
            kb_ids.extend(ent.kb_id_ for ent in processed.ents)
        return kb_ids

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    assert golfer_id in entity_linker.vocab.strings
    assert golfer_id in entity_linker.kb.vocab.strings

    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert entity_linker.model.get_dim("nO") == vector_length
    assert (
        entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
    )

    # Only the loss of the final update is checked.
    for _ in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
    russ_cochran_pattern = {
        "label": "PERSON",
        "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}],
    }
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns([russ_cochran_pattern])

    # test the trained model
    assert predicted_kb_ids(nlp) == GOLD_entities

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert nlp2.pipe_names == nlp.pipe_names
        assert golfer_id in nlp2.vocab.strings
        entity_linker2 = nlp2.get_pipe("entity_linker")
        assert golfer_id in entity_linker2.vocab.strings
        assert golfer_id in entity_linker2.kb.vocab.strings
        assert predicted_kb_ids(nlp2) == GOLD_entities

    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Russ Cochran captured his first major title with his son as caddie.",
        "Russ Cochran his reprints include EC Comics.",
        "Russ Cochran has been publishing comic art.",
        "Russ Cochran was a member of University of Kentucky's golf team.",
    ]
    first_batch = [d.to_array([ENT_KB_ID]) for d in nlp.pipe(texts)]
    second_batch = [d.to_array([ENT_KB_ID]) for d in nlp.pipe(texts)]
    one_by_one_docs = [nlp(text) for text in texts]
    one_by_one = [d.to_array([ENT_KB_ID]) for d in one_by_one_docs]
    assert_equal(first_batch, second_batch)
    assert_equal(first_batch, one_by_one)