def test_pretraining_tagger_tok2vec(config):
    """Pretrain the tagger's "tok2vec" layer using the listener-based config.

    The merged config is pointed at a sample JSONL file, pretraining runs into
    a temporary directory, and the test checks that ``model0.bin`` and
    ``model4.bin`` were written while ``model5.bin`` was not.

    NOTE(review): the ``config`` argument is overwritten on the first line
    with ``pretrain_string_listener``, so the value passed in is never used.
    Confirm whether that is intended.
    """
    config = Config().from_str(pretrain_string_listener)
    pipeline = util.load_model_from_config(config, auto_fill=True, validate=False)
    pipeline_cfg = pipeline.config
    pretrain_defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    settings = pretrain_defaults.merge(pipeline_cfg)
    with make_tempdir() as out_dir:
        settings["paths"]["raw_text"] = write_sample_jsonl(out_dir)
        # Target the tok2vec layer inside the tagger instead of a component.
        settings["pretraining"]["component"] = "tagger"
        settings["pretraining"]["layer"] = "tok2vec"
        settings = settings.interpolate()
        pretrain(settings, out_dir)
        for checkpoint in ("model0.bin", "model4.bin"):
            assert Path(out_dir / checkpoint).exists()
        assert not Path(out_dir / "model5.bin").exists()
def test_initialized_inline_transformer_todisk():
    """Round-trip an initialized tagger through ``to_disk`` / ``from_disk``.

    A pipeline built from ``inline_cfg_string`` must contain only a tagger.
    After adding the label "V" and initializing, the tagger is saved and
    loaded into a freshly built pipeline, which must then report that label.
    """
    source_cfg = Config().from_str(inline_cfg_string)
    first_nlp = util.load_model_from_config(
        source_cfg, auto_fill=True, validate=True
    )
    assert first_nlp.pipe_names == ["tagger"]
    first_tagger = first_nlp.get_pipe("tagger")
    first_tagger.add_label("V")
    first_nlp.initialize()
    with make_tempdir() as save_dir:
        first_tagger.to_disk(save_dir)
        # The second pipeline is built from the same config but never
        # initialized; its labels come from the serialized data.
        second_nlp = util.load_model_from_config(
            source_cfg, auto_fill=True, validate=True
        )
        second_tagger = second_nlp.get_pipe("tagger")
        second_tagger.from_disk(save_dir)
        assert list(second_tagger.labels) == ["V"]
def test_transformer_sentencepiece_IO():
    """Test that a transformer using sentencepiece trains + IO goes OK.

    The transformer name in the config is switched to "camembert-base", the
    pipeline is trained for two updates, and the listener output for a fixed
    sentence is compared after a disk round trip and after a bytes round trip.
    """
    sentence = "We're interested at underwater basket weaving."
    base_cfg = Config().from_str(cfg_string)
    base_cfg["components"]["transformer"]["model"]["name"] = "camembert-base"
    nlp = util.load_model_from_config(base_cfg, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    listener = tagger.model.get_ref("tok2vec").layers[0]

    train_examples = []
    for entry in TRAIN_DATA:
        raw_text, annotations = entry[0], entry[1]
        train_examples.append(Example.from_dict(nlp.make_doc(raw_text), annotations))
        for label in annotations["tags"]:
            tagger.add_label(label)

    optimizer = nlp.initialize(lambda: train_examples)
    for _ in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    reference_doc = nlp(sentence)
    reference_out = listener.predict([reference_doc])

    with make_tempdir() as tmp:
        # ensure IO goes OK
        first_path = tmp / "trained_nlp"
        nlp.to_disk(first_path)
        disk_nlp = util.load_model_from_path(first_path)
        disk_doc = disk_nlp(sentence)
        disk_listener = disk_nlp.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
        disk_out = disk_listener.predict([disk_doc])
        _assert_equal_tensors(disk_out[0].tensors, reference_out[0].tensors)

        # make sure that this can be saved to directory once more
        second_path = tmp / "trained_nlp_2"
        disk_nlp.to_disk(second_path)

        # ensure to_bytes / from_bytes works
        payload = nlp.to_bytes()
        bytes_nlp = util.load_model_from_config(
            base_cfg, auto_fill=True, validate=True
        )
        bytes_nlp.from_bytes(payload)
        bytes_doc = bytes_nlp(sentence)
        bytes_listener = bytes_nlp.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
        bytes_out = bytes_listener.predict([bytes_doc])
        _assert_equal_tensors(bytes_out[0].tensors, reference_out[0].tensors)
def test_serialize_parser(parser_config_string):
    """Check that a non-default parser config survives nlp serialization.

    A parser is added to an English pipeline with the given config, saved to
    disk and reloaded with ``spacy.load``. The reloaded model must expose a
    "tok2vec" reference and an input width of 66 on its "lower" layer (and on
    "upper" when the model has one).
    """
    pipeline = English()
    parser_cfg = Config().from_str(parser_config_string)
    parser = pipeline.add_pipe("parser", config=parser_cfg)
    parser.add_label("nsubj")
    pipeline.initialize()
    with make_tempdir() as save_dir:
        pipeline.to_disk(save_dir)
        reloaded = spacy.load(save_dir)
        parser_model = reloaded.get_pipe("parser").model
        # Only checks that the reference can be looked up.
        parser_model.get_ref("tok2vec")
        # check that we have the correct settings, not the default ones
        if parser_model.attrs["has_upper"]:
            upper_width = parser_model.get_ref("upper").get_dim("nI")
            assert upper_width == 66
        lower_width = parser_model.get_ref("lower").get_dim("nI")
        assert lower_width == 66
def test_replace_listeners_invalid():
    """``replace_listeners`` must raise ValueError for invalid arguments.

    After two training updates, each call in ``bad_calls`` is expected to
    raise ``ValueError``.
    """
    base_cfg = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(base_cfg, auto_fill=True, validate=True)
    sentence = "This is awesome"
    examples = [
        Example.from_dict(nlp.make_doc(sentence), {"tags": ["A", "B", "C"]})
    ]
    optimizer = nlp.initialize(lambda: examples)
    for _ in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)

    # (tok2vec component name, listening pipe name, listener paths)
    bad_calls = (
        ("invalid", "tagger", ["model.tok2vec"]),
        ("transformer", "parser", ["model.tok2vec"]),
        ("transformer", "tagger", ["model.yolo"]),
        ("transformer", "tagger", ["model.tok2vec", "model.yolo"]),
    )
    for call_args in bad_calls:
        with pytest.raises(ValueError):
            nlp.replace_listeners(*call_args)
def test_tok2vec_listener_callback():
    """The tagger's backprop callback must work after a tok2vec update.

    The tok2vec and tagger models are initialized by hand, the tok2vec
    component is updated on a new doc, and the callback returned by the
    tagger model's ``begin_update`` must then return something other than
    ``None``.
    """
    base_cfg = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(base_cfg, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")
    nlp._link_components()

    init_docs = [nlp.make_doc("A random sentence")]
    tok2vec.model.initialize(X=init_docs)
    # One row per entry in init_docs, one column per label ("V", "Z").
    gold_rows = [[1.0 for _ in ["V", "Z"]] for _ in init_docs]
    gold_sample = [tagger.model.ops.asarray(gold_rows, dtype="float32")]
    tagger.model.initialize(X=init_docs, Y=gold_sample)

    update_docs = [nlp.make_doc("Another entirely random sentence")]
    tok2vec.update([Example.from_dict(doc, {}) for doc in update_docs])
    outputs, backprop = tagger.model.begin_update(update_docs)
    # assure that the backprop call works (and doesn't hit a 'None' callback)
    assert backprop(outputs) is not None
def test_replace_listeners():
    """Replacing the tagger's transformer listener must keep predictions.

    Checks the listener wiring and config before replacement, trains briefly,
    replaces the listener, verifies that tags and tok2vec output for the same
    text are unchanged, and finally trains the modified pipeline.
    """
    sentence = "This is awesome"
    base_cfg = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(base_cfg, auto_fill=True, validate=True)
    examples = [
        Example.from_dict(nlp.make_doc(sentence), {"tags": ["A", "B", "C"]})
    ]
    optimizer = nlp.initialize(lambda: examples)

    # verify correct configuration with transformer listener
    transformer = nlp.get_pipe("transformer")
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    listener = tagger_tok2vec.get_ref("listener")
    assert isinstance(listener, TransformerListener)
    assert transformer.listener_map["tagger"][0] == listener
    trf_arch = nlp.config["components"]["transformer"]["model"]["@architectures"]
    assert trf_arch == "spacy-transformers.TransformerModel.v1"
    listener_arch = nlp.config["components"]["tagger"]["model"]["tok2vec"][
        "@architectures"
    ]
    assert listener_arch == "spacy-transformers.TransformerListener.v1"

    # train pipe before replacing listeners
    for _ in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
    doc_before = nlp(sentence)
    tags_before = [token.tag_ for token in doc_before]
    tensor_before = tagger_tok2vec.predict([doc_before])

    # replace listener and verify predictions are still the same
    nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
    tagger = nlp.get_pipe("tagger")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
    # nlp.config is read again here so the check sees the updated config.
    replaced_arch = nlp.config["components"]["tagger"]["model"]["tok2vec"][
        "@architectures"
    ]
    assert replaced_arch == "spacy-transformers.Tok2VecTransformer.v1"
    doc_after = nlp(sentence)
    assert tags_before == [token.tag_ for token in doc_after]
    assert_equal(tensor_before, tagger_tok2vec.predict([doc_after]))

    # attempt training with the new pipeline
    optimizer = nlp.resume_training()
    for _ in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tagger"] > 0.0
def test_pretraining_tok2vec_characters(objective):
    """Test that pretraining works with the character objective.

    ``objective`` is written into the "pretraining" section of the listener
    config before the pipeline is built. After pretraining into a temporary
    directory, ``model0.bin`` and ``model4.bin`` must exist and ``model5.bin``
    must not.
    """
    config = Config().from_str(pretrain_string_listener)
    config["pretraining"]["objective"] = objective
    pipeline = util.load_model_from_config(config, auto_fill=True, validate=False)
    pipeline_cfg = pipeline.config
    pretrain_defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    settings = pretrain_defaults.merge(pipeline_cfg)
    with make_tempdir() as out_dir:
        settings["paths"]["raw_text"] = write_sample_jsonl(out_dir)
        settings = settings.interpolate()
        # The component is not overridden here, so it must still be "tok2vec".
        assert settings["pretraining"]["component"] == "tok2vec"
        pretrain(settings, out_dir)
        for checkpoint in ("model0.bin", "model4.bin"):
            assert Path(out_dir / checkpoint).exists()
        assert not Path(out_dir / "model5.bin").exists()
def get_third_party_dependencies(
    config: Config, exclude: List[str] = util.SimpleFrozenList()
) -> List[str]:
    """If the config includes references to registered functions that are
    provided by third-party packages (spacy-transformers, other libraries), we
    want to include them in meta["requirements"] so that the package specifies
    them as dependencies and the user won't have to do it manually.

    We do this by:
    - traversing the config to check for registered function (@ keys)
    - looking up the functions and getting their module
    - looking up the module version and generating an appropriate version range

    Each package is listed at most once, and the result is ordered by the
    name of the top-level module that led to it, so repeated calls on the
    same config produce the same list.

    config (Config): The pipeline config.
    exclude (list): List of packages to exclude (e.g. that already exist in meta).
    RETURNS (list): The versioned requirements.
    """
    own_packages = ("spacy", "spacy-legacy", "spacy-nightly", "thinc", "srsly")
    distributions = util.packages_distributions()
    funcs = defaultdict(set)
    for path, value in util.walk_dict(config):
        key = path[-1]
        if key.startswith("@"):
            # collect all function references by registry
            funcs[key[1:]].add(value)
    for component in config.get("components", {}).values():
        if "factory" in component:
            funcs["factories"].add(component["factory"])
    modules = set()
    for reg_name, func_names in funcs.items():
        for func_name in func_names:
            func_info = util.registry.find(reg_name, func_name)
            module_name = func_info.get("module")
            if module_name:
                # the code is part of a module, not a --code file
                modules.add(module_name.split(".")[0])
    dependencies = []
    seen_packages = set()
    # Sorted so the output does not depend on set iteration order.
    for module_name in sorted(modules):
        dist = distributions.get(module_name)
        if not dist:
            continue
        pkg = dist[0]
        if pkg in own_packages or pkg in exclude:
            continue
        if pkg in seen_packages:
            # Several top-level modules can map to the same distribution.
            continue
        seen_packages.add(pkg)
        version = util.get_package_version(pkg)
        version_range = util.get_minor_version_range(version)
        dependencies.append(f"{pkg}{version_range}")
    return dependencies
def test_pretraining_training():
    """Test that training can use a pretrained Tok2Vec model.

    A pipeline is initialized once without pretrained weights and once with
    ``init_tok2vec`` pointing at the ``model3.bin`` checkpoint produced by
    pretraining. The "E" parameter of the hashembed node must differ between
    the two, and training on the second pipeline must run.
    """

    def _find_hashembed(embed_layer):
        # Keep the last node named "hashembed" met while walking the layer.
        found = None
        for node in embed_layer.walk():
            if node.name == "hashembed":
                found = node
        return found

    config = Config().from_str(pretrain_string_internal)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
    settings = nlp.config
    pretrain_defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    settings = pretrain_defaults.merge(settings)
    train_defaults = util.load_config(DEFAULT_CONFIG_PATH)
    settings = train_defaults.merge(settings)
    with make_tempdir() as tmp_dir:
        pretrain_dir = tmp_dir / "pretrain"
        pretrain_dir.mkdir()
        settings["paths"]["raw_text"] = write_sample_jsonl(pretrain_dir)
        settings["pretraining"]["component"] = "tagger"
        settings["pretraining"]["layer"] = "tok2vec"

        train_dir = tmp_dir / "train"
        train_dir.mkdir()
        train_path, dev_path = write_sample_training(train_dir)
        settings["paths"]["train"] = train_path
        settings["paths"]["dev"] = dev_path
        settings = settings.interpolate()
        pretrain_cfg = settings["pretraining"]

        # Baseline: embedding weights before any pretraining is applied.
        baseline_nlp = init_nlp(settings)
        baseline_pipe = baseline_nlp.get_pipe(pretrain_cfg["component"])
        baseline_layer = baseline_pipe.model.get_ref(pretrain_cfg["layer"])
        embed_base = _find_hashembed(baseline_layer.get_ref("embed"))

        pretrain(settings, pretrain_dir)
        pretrained_model = Path(pretrain_dir / "model3.bin")
        assert pretrained_model.exists()

        settings["initialize"]["init_tok2vec"] = str(pretrained_model)
        nlp = init_nlp(settings)
        loaded_pipe = nlp.get_pipe(pretrain_cfg["component"])
        loaded_layer = loaded_pipe.model.get_ref(pretrain_cfg["layer"])
        embed = _find_hashembed(loaded_layer.get_ref("embed"))

        # ensure that the tok2vec weights are actually changed by the pretraining
        weights_differ = np.not_equal(embed.get_param("E"), embed_base.get_param("E"))
        assert np.any(weights_differ)
        train(nlp, train_dir)
def test_serialize_nlp():
    """Create a custom nlp pipeline from config and ensure it serializes
    correctly.

    The same checks (component names and tok2vec output width 342) are run on
    the original pipeline and on the copy reloaded from disk.
    """

    def _check_pipeline(pipeline):
        names = pipeline.pipe_names
        assert "tok2vec" in names
        assert "tagger" in names
        assert "parser" not in names
        tok2vec_ref = pipeline.get_pipe("tagger").model.get_ref("tok2vec")
        assert tok2vec_ref.get_dim("nO") == 342

    nlp_config = Config().from_str(nlp_config_string)
    original = load_model_from_config(nlp_config, auto_fill=True)
    original.get_pipe("tagger").add_label("A")
    original.initialize()
    _check_pipeline(original)

    with make_tempdir() as save_dir:
        original.to_disk(save_dir)
        reloaded = spacy.load(save_dir)
        _check_pipeline(reloaded)
def test_annotating_components_from_config(config_str):
    """Training must depend on the configured annotating components.

    A reader named "unannotated_corpus" is registered that yields examples
    whose reference is the same un-annotated doc as the prediction. Training
    runs with ``annotating_components == ["sentencizer"]`` and is expected to
    raise ``ValueError`` once that list is emptied.
    """

    class UnannotatedCorpus:
        def __call__(self, nlp: Language) -> Iterator[Example]:
            for raw_text in ["a a", "b b", "c c"]:
                plain_doc = nlp.make_doc(raw_text)
                # The same Doc object is used as both sides of the Example.
                yield Example(plain_doc, plain_doc)

    @registry.readers("unannotated_corpus")
    def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
        return UnannotatedCorpus()

    loaded_cfg = Config().from_str(config_str)
    nlp = load_model_from_config(loaded_cfg, auto_fill=True, validate=True)
    assert nlp.config["training"]["annotating_components"] == ["sentencizer"]
    train(nlp)

    nlp.config["training"]["annotating_components"] = []
    with pytest.raises(ValueError):
        train(nlp)
def test_create_nlp_from_config_multiple_instances():
    """Test that the nlp object is created correctly for a config with
    multiple instances of the same component.

    The tagger section is reused under two names and tok2vec is renamed; the
    resulting pipeline, its factory metadata and its config must reflect the
    three custom names in order.
    """
    expected_names = ["t2v", "tagger1", "tagger2"]
    config = Config().from_str(nlp_config_string)
    tok2vec_section = config["components"]["tok2vec"]
    tagger_section = config["components"]["tagger"]
    config["components"] = {
        "t2v": tok2vec_section,
        "tagger1": tagger_section,
        "tagger2": tagger_section,
    }
    config["nlp"]["pipeline"] = list(config["components"].keys())
    nlp = load_model_from_config(config, auto_fill=True)

    assert nlp.pipe_names == expected_names
    expected_factories = (
        ("t2v", "tok2vec"),
        ("tagger1", "tagger"),
        ("tagger2", "tagger"),
    )
    for pipe_name, factory_name in expected_factories:
        assert nlp.get_pipe_meta(pipe_name).factory == factory_name

    components_cfg = nlp.config["components"]
    assert len(components_cfg) == 3
    assert list(components_cfg.keys()) == expected_names
    assert nlp.config["nlp"]["pipeline"] == expected_names
def test_transformer_pipeline_tagger():
    """Test that a pipeline with just a transformer+tagger runs and trains
    properly.

    Also checks that the transformer only picks up the tagger's listener once
    the pipeline is initialized, and that the listener output matches after a
    disk round trip.
    """
    sentence = "We're interested at underwater basket weaving."
    base_cfg = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(base_cfg, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["transformer", "tagger"]
    tagger = nlp.get_pipe("tagger")
    transformer = nlp.get_pipe("transformer")
    listener = tagger.model.get_ref("tok2vec").layers[0]
    assert isinstance(transformer, Transformer)
    assert isinstance(listener, TransformerListener)
    assert listener.upstream_name == "custom_upstream"

    train_examples = []
    for entry in TRAIN_DATA:
        raw_text, annotations = entry[0], entry[1]
        train_examples.append(Example.from_dict(nlp.make_doc(raw_text), annotations))
        for label in annotations["tags"]:
            tagger.add_label(label)

    # Check that the Transformer component finds it listeners
    assert transformer.listeners == []
    optimizer = nlp.initialize(lambda: train_examples)
    assert listener in transformer.listeners

    for _ in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    trained_doc = nlp(sentence)
    trained_out = listener.predict([trained_doc])
    assert_equal(trained_doc._.trf_data.tensors, trained_out[0].tensors)

    # ensure IO goes OK
    with make_tempdir() as tmp:
        model_path = tmp / "trained_nlp"
        nlp.to_disk(model_path)
        reloaded = util.load_model_from_path(model_path)
        reloaded_doc = reloaded(sentence)
        reloaded_listener = (
            reloaded.get_pipe("tagger").model.get_ref("tok2vec").layers[0]
        )
        reloaded_out = reloaded_listener.predict([reloaded_doc])
        assert_equal(reloaded_out[0].tensors, trained_out[0].tensors)
def test_replace_listeners_from_config():
    """Listeners can be replaced through "replace_listeners" in a config.

    A tok2vec/tagger/ner pipeline is saved to disk and then sourced into a
    new config in which only the tagger asks for its listener to be replaced.
    Afterwards only the ner component may still contain a Tok2VecListener, and
    the tagger's tok2vec config must equal the shared tok2vec model config.
    """

    def _has_listener(pipe):
        return any(isinstance(node, Tok2VecListener) for node in pipe.model.walk())

    base_cfg = Config().from_str(cfg_string_multi)
    nlp = util.load_model_from_config(base_cfg, auto_fill=True)
    annotations = {"tags": ["V", "Z"], "entities": [(0, 1, "A"), (1, 2, "B")]}
    examples = [Example.from_dict(nlp.make_doc("x y"), annotations)]
    nlp.initialize(lambda: examples)

    tok2vec = nlp.get_pipe("tok2vec")
    tagger = nlp.get_pipe("tagger")
    ner = nlp.get_pipe("ner")
    assert tok2vec.listening_components == ["tagger", "ner"]
    assert _has_listener(ner)
    assert _has_listener(tagger)

    with make_tempdir() as dir_path:
        nlp.to_disk(dir_path)
        base_model = str(dir_path)
        sourced_cfg = {
            "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
            "components": {
                "tok2vec": {"source": base_model},
                "tagger": {
                    "source": base_model,
                    "replace_listeners": ["model.tok2vec"],
                },
                "ner": {"source": base_model},
            },
        }
        new_nlp = util.load_model_from_config(sourced_cfg, auto_fill=True)
        new_nlp.initialize(lambda: examples)

        new_tok2vec = new_nlp.get_pipe("tok2vec")
        new_tagger = new_nlp.get_pipe("tagger")
        new_ner = new_nlp.get_pipe("ner")
        assert new_tok2vec.listening_components == ["ner"]
        assert _has_listener(new_ner)
        assert not _has_listener(new_tagger)

        t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
        assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
        assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
        ner_arch = new_nlp.config["components"]["ner"]["model"]["tok2vec"][
            "@architectures"
        ]
        assert ner_arch == "spacy.Tok2VecListener.v1"
def test_transformer_pipeline_empty():
    """Test that the pipeline doesn't fail with empty input.

    Covers three situations: updating on an empty doc (alone and mixed with
    regular examples), predicting on empty text (single call, batch of empty
    texts, and an empty batch), and predicting on a batch that mixes empty
    and non-empty texts.
    """
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    tagger = nlp.get_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # train on empty doc
    optimizer = nlp.initialize()
    losses = {}
    empty_train_example = Example.from_dict(nlp.make_doc(""), {})
    nlp.update(train_examples, sgd=optimizer, losses=losses)
    nlp.update([empty_train_example], sgd=optimizer, losses=losses)
    train_examples.append(empty_train_example)
    nlp.update(train_examples, sgd=optimizer, losses=losses)

    # predict empty doc
    doc = nlp("")
    _assert_empty(doc._.trf_data)
    for doc in nlp.pipe(["", ""]):
        _assert_empty(doc._.trf_data)
    # nlp.pipe is lazy: the result has to be consumed, otherwise the
    # empty-batch path is never actually executed.
    assert list(nlp.pipe([])) == []

    # predict combination of empty and non-empty
    doc = nlp("This is a sentence")
    normal_tags = [t.tag_ for t in doc]
    docs = list(nlp.pipe(["", "This is a sentence", "", ""]))
    _assert_empty(docs[0]._.trf_data)
    assert [t.tag_ for t in docs[0]] == []
    assert [t.tag_ for t in docs[1]] == normal_tags
    _assert_empty(docs[2]._.trf_data)
    _assert_empty(docs[3]._.trf_data)
def test_config_interpolation(d):
    """Test that config values are interpolated correctly.

    The parametrized value is the final divider (${a.b} vs. ${a:b}). Both
    should now work and be valid. The double {{ }} in the config strings are
    required to prevent the references from being interpreted as an actual
    f-string variable.
    """
    # A reference without a section prefix is expected to be rejected.
    unresolved = """[a]\nfoo = "hello"\n\n[b]\nbar = ${foo}"""
    with pytest.raises(ConfigValidationError):
        Config().from_str(unresolved)

    # Interpolation within the same section
    same_section = f"""[a]\nfoo = "x"\nbar = ${{a{d}foo}}\nbaz = "${{a{d}foo}}y\""""

    # (config text, section, key, expected value)
    cases = [
        (f"""[a]\nfoo = "hello"\n\n[b]\nbar = ${{a{d}foo}}""", "b", "bar", "hello"),
        (f"""[a]\nfoo = "hello"\n\n[b]\nbar = ${{a{d}foo}}!""", "b", "bar", "hello!"),
        (
            f"""[a]\nfoo = "hello"\n\n[b]\nbar = "${{a{d}foo}}!\"""",
            "b",
            "bar",
            "hello!",
        ),
        (f"""[a]\nfoo = 15\n\n[b]\nbar = ${{a{d}foo}}!""", "b", "bar", "15!"),
        (
            f"""[a]\nfoo = ["x", "y"]\n\n[b]\nbar = ${{a{d}foo}}""",
            "b",
            "bar",
            ["x", "y"],
        ),
        (same_section, "a", "bar", "x"),
        (same_section, "a", "baz", "xy"),
    ]
    for text, section, key, expected in cases:
        assert Config().from_str(text)[section][key] == expected
def test_config_custom_sort_preserve():
    """Test that sort order is preserved when merging and copying configs,
    or when configs are filled and resolved."""
    order = ["y", "z", "x"]
    rendered = "[y]\n\n[z]\n\n[x]"
    base = Config({"x": {}, "y": {}, "z": {}}, section_order=order)
    assert base.to_str() == rendered

    # Copying, merging and re-wrapping must all keep the custom order.
    copied = base.copy()
    assert copied.to_str() == rendered
    merged = base.merge({"a": {}})
    assert merged.to_str() == f"{rendered}\n\n[a]"
    rewrapped = Config(base)
    assert rewrapped.to_str() == rendered

    # The order passed to the constructor also applies when parsing a string
    # and is carried over by fill_config.
    parsed_order = ["c", "a", "t"]
    raw = """[a]\nb = 1\n[c]\n@cats = "catsie.v1"\nevil = true\n\n[t]\n x = 2"""
    parsed = Config(section_order=parsed_order).from_str(raw)
    assert list(parsed.keys()) == parsed_order
    filled = my_registry.fill_config(parsed)
    assert filled.section_order == parsed_order
def test_positional_args_to_from_string():
    """Positional arguments ("*") must parse, serialize, fill and resolve.

    Covers the list form (``* = [...]``), the sectioned form (``[a.*.foo]``),
    and a nested registered function inside a positional section.
    """
    # Both notations must survive a from_str -> to_str round trip unchanged.
    for round_trip_cfg in (
        """[a]\nb = 1\n* = ["foo","bar"]""",
        """[a]\nb = 1\n\n[a.*.foo]\ntest = 1\n\n[a.*.bar]\ntest = 2""",
    ):
        assert Config().from_str(round_trip_cfg).to_str() == round_trip_cfg

    @my_registry.cats("catsie.v666")
    def catsie_666(*args, meow=False):
        return args

    list_cfg = """[a]\n@cats = "catsie.v666"\n* = ["foo","bar"]"""
    list_filled = my_registry.fill_config(Config().from_str(list_cfg)).to_str()
    assert (
        list_filled
        == """[a]\n@cats = "catsie.v666"\n* = ["foo","bar"]\nmeow = false"""
    )
    list_made = my_registry.make_from_config(Config().from_str(list_cfg))
    assert list_made == {"a": ("foo", "bar")}

    section_cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\nx = 1"""
    section_filled = my_registry.fill_config(Config().from_str(section_cfg)).to_str()
    assert (
        section_filled
        == """[a]\n@cats = "catsie.v666"\nmeow = false\n\n[a.*.foo]\nx = 1"""
    )
    section_made = my_registry.make_from_config(Config().from_str(section_cfg))
    assert section_made == {"a": ({"x": 1},)}

    @my_registry.cats("catsie.v777")
    def catsie_777(y: int = 1):
        return "meow" * y

    nested_cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\n@cats = "catsie.v777\""""
    nested_filled = my_registry.fill_config(Config().from_str(nested_cfg)).to_str()
    nested_expected = """[a]\n@cats = "catsie.v666"\nmeow = false\n\n[a.*.foo]\n@cats = "catsie.v777"\ny = 1"""
    assert nested_filled == nested_expected

    nested_y_cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\n@cats = "catsie.v777"\ny = 3"""
    nested_made = my_registry.make_from_config(Config().from_str(nested_y_cfg))
    assert nested_made == {"a": ("meowmeowmeow",)}
def main(
    numpy: bool = False,
    pytorch: bool = False,
    generic: bool = False,
    gpu_id: int = -1,
):
    """Time a forward pass and a forward+backward pass per selected backend.

    numpy (bool): Run with the "numpy" backend.
    pytorch (bool): Run with the "pytorch" backend.
    generic (bool): Run with the "generic" backend.
    gpu_id (int): GPU to require when >= 0; also passed to ``set_backend``.

    For each enabled backend the model and dummy data are rebuilt from
    ``CONFIG``. The model's first layer is applied to every batch up front
    and then removed from the model, so it is not part of the timed runs.
    Results are printed; nothing is returned.
    """
    fix_random_seed(0)
    if gpu_id >= 0:
        require_gpu(gpu_id)
        print("Set GPU", gpu_id)
    selected = {"pytorch": pytorch, "numpy": numpy, "generic": generic}
    for backend_name, enabled in selected.items():
        if not enabled:
            print(f"Skipping {backend_name}")
            continue
        set_backend(backend_name, gpu_id)

        print("Getting data")
        resolved = registry.resolve(Config().from_str(CONFIG))
        model = resolved["model"]
        inputs, targets = get_dummy_data(**resolved["data"])

        print("Copy to device")
        inputs = [model.ops.asarray(item) for item in inputs]
        targets = [model.ops.asarray(item) for item in targets]

        print("Begin init", len(inputs))
        model.initialize(X=inputs[:5])

        print("Pre-batch")
        n_words = sum(len(item) for item in inputs)
        raw_batches = model.ops.multibatch(16, inputs, targets)
        first_layer = model.layers[0]
        batches = [(first_layer.predict(bx), by) for bx, by in raw_batches]
        model.layers.pop(0)

        print("Start")
        started = timer()
        total = run_forward(model, [bx for bx, _ in batches])
        finished = timer()
        print(backend_name, n_words, total, finished - started)

        started = timer()
        total = run_forward_backward(model, batches)
        finished = timer()
        print(backend_name, n_words, total, finished - started)
def test_replace_listeners():
    """Replacing the tagger's Tok2VecListener must update model and config.

    Checks the wiring before replacement, replaces the listener, checks that
    the tagger's tok2vec config now equals the tok2vec component's model
    config, verifies that invalid arguments raise ``ValueError``, and finally
    trains the modified pipeline.
    """
    base_cfg = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(base_cfg, auto_fill=True, validate=True)
    examples = [Example.from_dict(nlp.make_doc("x y"), {"tags": ["V", "Z"]})]
    nlp.initialize(lambda: examples)
    tok2vec = nlp.get_pipe("tok2vec")
    tagger = nlp.get_pipe("tagger")

    first_layer = tagger.model.layers[0]
    assert isinstance(first_layer, Tok2VecListener)
    assert tok2vec.listener_map["tagger"][0] == first_layer
    tok2vec_arch = nlp.config["components"]["tok2vec"]["model"]["@architectures"]
    assert tok2vec_arch == "spacy.Tok2Vec.v2"
    listener_arch = nlp.config["components"]["tagger"]["model"]["tok2vec"][
        "@architectures"
    ]
    assert listener_arch == "spacy.Tok2VecListener.v1"

    nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
    assert not isinstance(tagger.model.layers[0], Tok2VecListener)
    t2v_cfg = nlp.config["components"]["tok2vec"]["model"]
    assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
    assert nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg

    # (tok2vec component name, listening pipe name, listener paths)
    bad_calls = (
        ("invalid", "tagger", ["model.tok2vec"]),
        ("tok2vec", "parser", ["model.tok2vec"]),
        ("tok2vec", "tagger", ["model.yolo"]),
        ("tok2vec", "tagger", ["model.tok2vec", "model.yolo"]),
    )
    for call_args in bad_calls:
        with pytest.raises(ValueError):
            nlp.replace_listeners(*call_args)

    # attempt training with the new pipeline
    optimizer = nlp.initialize(lambda: examples)
    for _ in range(2):
        losses = {}
        nlp.update(examples, sgd=optimizer, losses=losses)
        assert losses["tok2vec"] == 0.0
        assert losses["tagger"] > 0.0
def test_config_deep_merge():
    """``Config(defaults).merge(config)`` must merge nested sections.

    Each case lists the overriding config, the defaults, and the top-level
    values expected in the merged result. The merged config must contain
    exactly as many top-level keys as the expectation.
    """
    cases = [
        # Plain nested dicts: missing keys are filled in from the defaults.
        (
            {"a": "hello", "b": {"c": "d"}},
            {"a": "world", "b": {"c": "e", "f": "g"}},
            {"a": "hello", "b": {"c": "d", "f": "g"}},
        ),
        # Same "@test" value on both sides: defaults are still filled in.
        (
            {"a": "hello", "b": {"@test": "x", "foo": 1}},
            {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100},
            {"a": "hello", "b": {"@test": "x", "foo": 1, "bar": 2}, "c": 100},
        ),
        # Different "@test" value: the default block is not merged in.
        (
            {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100},
            {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}},
            {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100},
        ),
        # Test that leaving out the factory just adds to existing
        (
            {"a": "hello", "b": {"foo": 1}, "c": 100},
            {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}},
            {"a": "hello", "b": {"@test": "y", "foo": 1, "bar": 2}, "c": 100},
        ),
        # Test that switching to a different factory prevents the default
        # from being added
        (
            {"a": "hello", "b": {"@foo": 1}, "c": 100},
            {"a": "world", "b": {"@bar": "y"}},
            {"a": "hello", "b": {"@foo": 1}, "c": 100},
        ),
        # The default for "b" is a plain value rather than a section.
        (
            {"a": "hello", "b": {"@foo": 1}, "c": 100},
            {"a": "world", "b": "y"},
            {"a": "hello", "b": {"@foo": 1}, "c": 100},
        ),
    ]
    for config, defaults, expected in cases:
        merged = Config(defaults).merge(config)
        assert len(merged) == len(expected)
        for key, value in expected.items():
            assert merged[key] == value
def get_nlps(language: str, *, add_coreferee: bool = True) -> list:
    """Returns a list of *nlp* objects to use when testing the functionality
    for *language*. The list contains the latest versions of the Spacy models
    named in the config file.

    The list is built on the first call for a given *language* and cached in
    *language_to_nlps*; later calls return the cached list. It is sorted by
    each pipeline's ``meta['name']`` and ``meta['version']``.

    Note that if this method is called with *add_coreferee=False*, this
    setting will apply to all future calls within the same process space.
    This means that *add_coreferee=False* is only appropriate during
    development of rules tests and before any smoke tests are required.

    Raises *LanguageNotSupportedError* when no ``lang/<language>/config.cfg``
    resource exists in the 'coreferee' package.
    """
    with lock:
        if language not in language_to_nlps:
            # pkg_resources resource names are always '/'-separated and are
            # not filesystem paths, so the OS path separator must not be
            # used to build them.
            relative_config_filename = "/".join(("lang", language, "config.cfg"))
            if not pkg_resources.resource_exists(
                "coreferee", relative_config_filename
            ):
                raise LanguageNotSupportedError(language)
            # NOTE(review): existence is checked against 'coreferee' but the
            # filename is resolved against __name__ -- confirm both refer to
            # the same package root.
            absolute_config_filename = pkg_resources.resource_filename(
                __name__, relative_config_filename
            )
            config = Config().from_disk(absolute_config_filename)
            # A set, so a model named by several config entries loads once.
            model_set = {
                "_".join((language, config[config_entry]["model"]))
                for config_entry in config
            }
            nlps = []
            for model in model_set:
                # At present we presume there will never be an entry in the
                # config file that specifies a model name that can no longer
                # be loaded. This seems a reasonable assumption, but if it no
                # longer applies this code will need to be changed in the
                # future.
                nlp = spacy.load(model)
                if add_coreferee:
                    nlp.add_pipe("coreferee")
                nlps.append(nlp)
            nlps = sorted(
                nlps, key=lambda item: (item.meta["name"], item.meta["version"])
            )
            language_to_nlps[language] = nlps
        return language_to_nlps[language]
def test_config_to_str_roundtrip():
    """Configs must serialize to the expected text and parse back unchanged.

    Covers a boolean versus the string "false", a value that is expected to
    fail serialization, and variables that must be kept as written when
    parsing with ``interpolate=False``.
    """
    # (config data, expected serialized text)
    simple_cases = [
        ({"cfg": {"foo": False}}, "[cfg]\nfoo = false"),
        ({"cfg": {"foo": "false"}}, '[cfg]\nfoo = "false"'),
    ]
    for data, serialized in simple_cases:
        as_text = Config(data).to_str()
        assert as_text == serialized
        parsed = Config().from_str(as_text)
        assert dict(parsed) == data

    # Bad non-serializable value
    bad_data = {
        "cfg": {"x": numpy.asarray([[1, 2, 3, 4], [4, 5, 3, 4]], dtype="f")}
    }
    bad_config = Config(bad_data)
    with pytest.raises(ConfigValidationError):
        bad_config.to_str()

    # Roundtrip with variables: preserve variables correctly (quoted/unquoted)
    with_vars = """[a]\nb = 1\n\n[c]\nd = ${a:b}\ne = \"hello${a:b}"\nf = "${a:b}\""""
    uninterpolated = Config().from_str(with_vars, interpolate=False)
    assert uninterpolated.to_str() == with_vars
from ..vocab import Vocab from ..language import Language from ..errors import Errors default_model_config = """ [model] @architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true """ DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tok2vec", assigns=["doc.tensor"], default_config={"model": DEFAULT_TOK2VEC_MODEL} ) def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": return Tok2Vec(nlp.vocab, model, name) class Tok2Vec(TrainablePipe): """Apply a "token-to-vector" model and set its outputs in the doc.tensor attribute. This is mostly useful to share a single subnetwork between multiple components, e.g. to have one embedding and CNN network shared between a parser, tagger and NER.
[model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 96 rows = [5000, 2000, 1000, 1000] attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"] include_static_vectors = false [model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" width = ${model.tok2vec.embed.width} window_size = 1 maxout_pieces = 3 depth = 4 """ DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"] @runtime_checkable class Suggester(Protocol): def __call__(self, docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged: ... @registry.misc("spacy.ngram_suggester.v1") def build_ngram_suggester(sizes: List[int]) -> Suggester: """Suggest all spans of the given lengths. Spans are returned as a ragged array of integers. The array has two columns, indicating the start and end position.""" def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
from thinc.api import Config defaults = Config().from_str(""" [Yake] window = 2 lemmatize = false candidate_selection = ngram [TextRank] window = 3 alpha = 0.85 tol = 1.0e-6 candidate_selection = chunk [PositionRank] window = 10 alpha = 0.85 tol = 1.0e-5 normalize = false candidate_selection = chunk [TopicRank] clustering_method = average distance_metric = jaccard threshold = 0.74 alpha = 0.85 tol = 1.0e-6 heuristic = null candidate_selection = chunk """) @Language.factory("yake", default_config=defaults["Yake"])
def test_create_nlp_from_pretraining_config():
    """Test that the default pretraining config validates properly.

    The default pretraining config is merged into the config parsed from
    ``pretrain_config_string``, and the resulting "pretraining" section is
    resolved against ``ConfigSchemaPretrain``. The test passes if that call
    does not raise.
    """
    base = Config().from_str(pretrain_config_string)
    pretrain_defaults = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = base.merge(pretrain_defaults)
    pretraining_section = merged["pretraining"]
    registry.resolve(pretraining_section, schema=ConfigSchemaPretrain)
default_model_config = """ [model] @architectures = "spacy.EntityLinker.v1" [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = null width = 96 depth = 2 embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true """ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "entity_linker", requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], assigns=["token.ent_kb_id"], default_config={ "model": DEFAULT_NEL_MODEL, "labels_discard": [], "incl_prior": True, "incl_context": True, "entity_vector_length": 64, "get_candidates": { "@misc": "spacy.CandidateGenerator.v1" },
[transformer.set_extra_annotations] @annotation_setters = "spacy-transformers.null_annotation_setter.v1" [transformer.model] @architectures = "spacy-transformers.TransformerModel.v2" name = "roberta-base" tokenizer_config = {"use_fast": true} transformer_config = {} [transformer.model.get_spans] @span_getters = "spacy-transformers.strided_spans.v1" window = 128 stride = 96 """ DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR) DOC_EXT_ATTR = "trf_data" @Language.factory( "transformer", assigns=[f"doc._.{DOC_EXT_ATTR}"], default_config=DEFAULT_CONFIG["transformer"], ) def make_transformer( nlp: Language, name: str, model: Model[List[Doc], FullTransformerBatch], set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], max_batch_items: int, ):