def test_pipe_factories_language_specific():
    """Check that Language subclasses can register their own factories and
    still fall back to factories registered on the base class."""
    shared = "specific_component1"
    german_only = "specific_component2"
    Language.component(shared, func=lambda: "base")
    English.component(shared, func=lambda: "en")
    German.component(german_only, func=lambda: "de")

    # (class, factory name, whether the class is expected to see the factory)
    visibility = [
        (Language, shared, True),
        (Language, german_only, False),
        (English, shared, True),
        (English, german_only, False),
        (German, shared, True),
        (German, german_only, True),
    ]
    for lang_cls, factory_name, expected in visibility:
        assert bool(lang_cls.has_factory(factory_name)) == expected

    # (class, value produced by the shared component, value produced by the
    # German-only component or None when creating it must fail)
    creation = [
        (Language, "base", None),
        (English, "en", None),
        (German, "base", "de"),
    ]
    for lang_cls, shared_value, german_value in creation:
        pipeline = lang_cls()
        assert pipeline.create_pipe(shared)() == shared_value
        if german_value is None:
            with pytest.raises(ValueError):
                pipeline.create_pipe(german_only)
        else:
            assert pipeline.create_pipe(german_only)() == german_value
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
    """Check nlp.pipe error handling when the input is (text, context) tuples"""
    Language.component("my_evil_component", func=evil_component)
    ops = get_current_ops()
    # Only run for NumpyOps, or when fewer than two processes are requested.
    if not isinstance(ops, NumpyOps) and n_process >= 2:
        return

    nlp = English()
    nlp.add_pipe("my_evil_component")
    texts = [
        ("TEXT 111", 111),
        ("TEXT 222", 222),
        ("TEXT 333", 333),
        ("TEXT 342", 342),
        ("TEXT 666", 666),
    ]
    # Without a custom error handler the component's error propagates.
    with pytest.raises(ValueError):
        list(nlp.pipe(texts, as_tuples=True))

    nlp.set_error_handler(warn_error)
    spacy_logger = logging.getLogger("spacy")
    with mock.patch.object(spacy_logger, "warning") as mock_warning:
        tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
        # HACK/TODO? the warnings in child processes don't seem to be
        # detected by the mock logger
        if n_process != 1:
            return
        mock_warning.assert_called()
        assert mock_warning.call_count == 2
        assert len(tuples) + mock_warning.call_count == len(texts)
        surviving = [(doc.text, context) for doc, context in tuples[:3]]
        assert surviving == [("TEXT 111", 111), ("TEXT 333", 333), ("TEXT 666", 666)]
def test_language_pipe_error_handler_custom(en_vocab, n_process):
    """Check error handling for a custom component that has no pipe method"""
    Language.component("my_evil_component", func=evil_component)
    ops = get_current_ops()
    # Only run for NumpyOps, or when fewer than two processes are requested.
    if not isinstance(ops, NumpyOps) and n_process >= 2:
        return

    nlp = English()
    nlp.add_pipe("my_evil_component")
    texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
    # With the default handler, the custom component's error is raised.
    with pytest.raises(ValueError):
        list(nlp.pipe(texts))

    nlp.set_error_handler(warn_error)
    spacy_logger = logging.getLogger("spacy")
    with mock.patch.object(spacy_logger, "warning") as mock_warning:
        # With warn_error installed, each bad doc produces a warning
        # instead of an exception.
        docs = list(nlp.pipe(texts, n_process=n_process))
        # HACK/TODO? the warnings in child processes don't seem to be
        # detected by the mock logger
        if n_process != 1:
            return
        mock_warning.assert_called()
        assert mock_warning.call_count == 2
        assert len(docs) + mock_warning.call_count == len(texts)
        surviving_texts = [doc.text for doc in docs]
        assert surviving_texts == ["TEXT 111", "TEXT 333", "TEXT 666"]
def add_meta_cat(self, meta_cat, name):
    """Register ``meta_cat`` as a spaCy component and append it to the pipeline.

    meta_cat: callable registered as the component function.
    name: name given to the component inside ``self.nlp``.
    """
    factory_name = spacy.util.get_object_name(meta_cat)
    Language.component(factory_name, func=meta_cat)
    self.nlp.add_pipe(factory_name, name=name, last=True)

    # meta_anns is the only extension needed here; it will hold a dictionary
    # of {category_name: value, ...}.
    Span.set_extension('meta_anns', default=None, force=True)
def add_token_normalizer(self, config, spell_checker=None):
    """Build a TokenNormalizer and append it to the pipeline as 'token_normalizer'.

    config: passed through to ``TokenNormalizer``.
    spell_checker: optional, passed through to ``TokenNormalizer``.
    """
    normalizer = TokenNormalizer(spell_checker=spell_checker, config=config)
    factory_name = spacy.util.get_object_name(normalizer)
    Language.component(factory_name, func=normalizer)
    self.nlp.add_pipe(factory_name, name='token_normalizer', last=True)

    # Custom token-level field required by this use case.
    Token.set_extension('norm', default=None, force=True)
def load():
    """Create a Thai pipeline with the tagger followed by the parser."""
    nlp = Thai()

    if not SPACY_V3:
        # Older API: add_pipe receives the component objects directly.
        nlp.add_pipe(ThaiTagger(nlp))
        nlp.add_pipe(ThaiParser(nlp))
        return nlp

    # spaCy v3: each component is registered under a name, then added by name.
    # Each component is constructed right before it is registered and added.
    stages = (("thai_tagger", ThaiTagger), ("thai_parser", ThaiParser))
    for factory_name, component_cls in stages:
        Language.component(factory_name, func=component_cls(nlp))
        nlp.add_pipe(factory_name)
    return nlp
def add_linker(self, linker):
    r''' Register the entity linker and append it to the pipeline as
    'cat_linker'; also declares the Span fields it relies on.

    linker (object/function):
        Any object/function that satisfies the requirements for a spaCy
        pipeline component, see
        https://spacy.io/usage/processing-pipelines#custom-components
    '''
    factory_name = spacy.util.get_object_name(linker)
    Language.component(factory_name, func=linker)
    self.nlp.add_pipe(factory_name, name='cat_linker', last=True)

    # Both Span fields default to -1.
    for field in ('cui', 'context_similarity'):
        Span.set_extension(field, default=-1, force=True)
def add_ner(self, ner):
    r''' Register the CAT NER component and append it to the pipeline as
    'cat_ner'; also declares the Doc and Span fields it relies on.
    '''
    factory_name = spacy.util.get_object_name(ner)
    Language.component(factory_name, func=ner)
    self.nlp.add_pipe(factory_name, name='cat_ner', last=True)

    Doc.set_extension('ents', default=[], force=True)

    span_fields = (
        ('confidence', -1),
        ('id', 0),
        # Do not set detected_name if a vocabulary approach is not used; the
        # name must refer to a name2cuis in the cdb.
        ('detected_name', None),
        ('link_candidates', None),
    )
    for field, default_value in span_fields:
        Span.set_extension(field, default=default_value, force=True)
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
    """Check the error handling around a component's pipe method"""
    Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
    Language.component("assert_sents_error", func=assert_sents_error)
    ops = get_current_ops()
    # Only run for NumpyOps, or when fewer than two processes are requested.
    if not isinstance(ops, NumpyOps) and n_process >= 2:
        return

    texts = [f"{str(i)} is enough. Done" for i in range(100)]
    nlp = English()
    for factory_name in ("my_perhaps_sentences", "assert_sents_error"):
        nlp.add_pipe(factory_name)
    nlp.initialize()

    # assert_sents_error requires sentence boundaries and raises otherwise.
    with pytest.raises(ValueError):
        list(nlp.pipe(texts, n_process=n_process, batch_size=10))

    nlp.set_error_handler(ignore_error)
    surviving = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
    # we lose/ignore the failing 4,40-49 docs
    assert len(surviving) == 89
def build_pipeline(disable: "list | None" = None):
    """
    Function that creates the pipeline for the creation of
    (sentence, reference) tuples.

    Args:
        - disable: names of pipeline components to disable once the custom
          components have been added. ``None`` (the default) disables nothing.

    Returns:
        - nlp: spaCy pipeline instance
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if disable is None:
        disable = []

    nlp = load_spacy_model("de_core_news_sm")

    # Register the reference-matching function under a name so that it can be
    # added to the pipeline by that name.
    Language.component("reference_matcher", func=match_reference)

    nlp.add_pipe("sentencizer")
    nlp.add_pipe("reference_matcher", before="tagger")

    nlp.disable_pipes(*disable)

    print("\nActivated pipes:")
    print(nlp.pipe_names)

    return nlp
def test_pipe_factories_decorator_idempotent(i, func, func2):
    """Check that decorator can be run multiple times if the function is the
    same. This is especially relevant for live reloading because we don't
    want spaCy to raise an error if a module registering components is reloaded.

    i: index used to build a unique factory name.
    func: function registered through Language.factory.
    func2: function registered through Language.component.
    """
    name = f"test_pipe_factories_decorator_idempotent_{i}"
    # `_` rather than `i`, so the loop does not shadow the test argument.
    for _ in range(5):
        Language.factory(name, func=func)
    nlp = Language()
    nlp.add_pipe(name)
    Language.factory(name, func=func)
    # Make sure it also works for component decorator, which creates the
    # factory function
    name2 = f"{name}2"
    for _ in range(5):
        Language.component(name2, func=func2)
    nlp = Language()
    # Add the component registered just above (name2), so the component
    # decorator path is the one that actually gets added to a pipeline.
    nlp.add_pipe(name2)
    Language.component(name2, func=func2)
def test_update_with_annotates():
    """Check that nlp.update runs exactly the components listed in `annotates`."""
    name = "test_with_annotates"
    results = {}

    def make_component(name):
        # Each component appends the text of every doc it sees to its own
        # entry in `results`.
        results[name] = ""

        def component(doc):
            results[name] += doc.text
            return doc

        return component

    first = f"{name}1"
    second = f"{name}2"
    Language.component(first, func=make_component(first))
    Language.component(second, func=make_component(second))

    components = {first, second}

    nlp = English()
    texts = ["a", "bb", "ccc"]
    examples = [Example(nlp.make_doc(text), nlp.make_doc(text)) for text in texts]

    annotate_cases = [
        [],
        [first],
        [first, second],
        [second, first],
    ]
    for components_to_annotate in annotate_cases:
        # Reset what was recorded by the previous case.
        for key in results:
            results[key] = ""
        nlp = English(vocab=nlp.vocab)
        nlp.add_pipe(first)
        nlp.add_pipe(second)
        nlp.update(examples, annotates=components_to_annotate)
        for annotated in components_to_annotate:
            assert results[annotated] == "".join(eg.predicted.text for eg in examples)
        for skipped in components - set(components_to_annotate):
            assert results[skipped] == ""
def nlp2(nlp, sample_vectors):
    """Return ``nlp`` with the sample vectors added to its vocab and the three
    test components appended to its pipeline."""
    registrations = (
        ("test_language_vector_modification_pipe", vector_modification_pipe),
        ("test_language_userdata_pipe", userdata_pipe),
        ("test_language_ner_pipe", ner_pipe),
    )
    for factory_name, component_func in registrations:
        Language.component(factory_name, func=component_func)

    add_vecs_to_vocab(nlp.vocab, sample_vectors)

    # Pipeline order differs from registration order: the NER pipe is added
    # before the userdata pipe.
    pipeline_order = (
        "test_language_vector_modification_pipe",
        "test_language_ner_pipe",
        "test_language_userdata_pipe",
    )
    for factory_name in pipeline_order:
        nlp.add_pipe(factory_name)
    return nlp
def test_add_pipe_last(nlp, name1, name2):
    """Check that add_pipe(..., last=True) puts the component at the end."""
    Language.component("new_pipe2", func=lambda doc: doc)
    nlp.add_pipe("new_pipe2", name=name2)
    nlp.add_pipe("new_pipe", name=name1, last=True)

    first_name, _ = nlp.pipeline[0]
    assert first_name != name1
    last_name, _ = nlp.pipeline[-1]
    assert last_name == name1
def test_disable_enable_pipes():
    """Walk two components through disable, enable, remove and rename, and
    check the pipeline bookkeeping after each step."""
    name = "test_disable_enable_pipes"
    results = {}

    def make_component(name):
        # Each component records the text of the last doc it processed.
        results[name] = ""

        def component(doc):
            results[name] = doc.text
            return doc

        return component

    first = f"{name}1"
    second = f"{name}2"
    c1 = Language.component(first, func=make_component(first))
    c2 = Language.component(second, func=make_component(second))

    nlp = Language()
    nlp.add_pipe(first)
    nlp.add_pipe(second)
    assert results[first] == ""
    assert results[second] == ""
    assert nlp.pipeline == [(first, c1), (second, c2)]
    assert nlp.pipe_names == [first, second]

    # Disable the first component: still listed as a component, not as a pipe.
    nlp.disable_pipe(first)
    assert nlp.disabled == [first]
    assert nlp.component_names == [first, second]
    assert nlp.pipe_names == [second]
    assert nlp.config["nlp"]["disabled"] == [first]
    nlp("hello")
    assert results[first] == ""  # didn't run
    assert results[second] == "hello"  # ran

    # Re-enable it: both components run again.
    nlp.enable_pipe(first)
    assert nlp.disabled == []
    assert nlp.pipe_names == [first, second]
    assert nlp.config["nlp"]["disabled"] == []
    nlp("world")
    assert results[first] == "world"
    assert results[second] == "world"

    # Disable and then remove the second component.
    nlp.disable_pipe(second)
    nlp.remove_pipe(second)
    assert nlp.components == [(first, c1)]
    assert nlp.pipeline == [(first, c1)]
    assert nlp.component_names == [first]
    assert nlp.pipe_names == [first]
    assert nlp.disabled == []
    assert nlp.config["nlp"]["disabled"] == []

    # Rename the remaining component; it keeps writing to its original key.
    nlp.rename_pipe(first, name)
    assert nlp.components == [(name, c1)]
    assert nlp.component_names == [name]
    nlp("!")
    assert results[first] == "!"
    assert results[second] == "world"

    # The removed component can no longer be disabled.
    with pytest.raises(ValueError):
        nlp.disable_pipe(second)

    # Disabling the renamed component leaves no active pipes.
    nlp.disable_pipe(name)
    assert nlp.component_names == [name]
    assert nlp.pipe_names == []
    assert nlp.config["nlp"]["disabled"] == [name]
    nlp("?")
    assert results[first] == "!"
def test_add_lots_of_pipes(nlp, n_pipes):
    """Check that the same factory can be added many times under distinct names."""
    Language.component("n_pipes", func=lambda doc: doc)
    instance_names = [f"pipe_{i}" for i in range(n_pipes)]
    for instance_name in instance_names:
        nlp.add_pipe("n_pipes", name=instance_name)
    assert len(nlp.pipe_names) == n_pipes
meta_token.is_sent_start = False # The leading '=' is a sentence boundary doc[i - 9].is_sent_start = True # Any token following the metadata is also a new sentence. doc[i + 1].is_sent_start = True return doc NLP = spacy.load('nl_core_news_sm') try: NLP.add_pipe(_metadata_sentence_segmentation, before="parser") # Insert before the parser except ValueError: # spacy>=3 from spacy.language import Language Language.component('meta-sentence-segmentation')( _metadata_sentence_segmentation) # pylint: disable=E1101 NLP.add_pipe('meta-sentence-segmentation', before="parser") # Insert before the parser for case in TOKENIZER_SPECIAL_CASES: NLP.tokenizer.add_special_case(case, [{ORTH: case}]) NLP.tokenizer.add_special_case(case.lower(), [{ORTH: case.lower()}]) infixes = NLP.Defaults.infixes + [r'\(', r'\)', r'(?<=[\D])\/(?=[\D])'] infix_regex = spacy.util.compile_infix_regex(infixes) NLP.tokenizer.infix_finditer = infix_regex.finditer class TokenizerOns(Tokenizer): def parse_text(self, text: str) -> spacy.tokens.doc.Doc: """Custom spacy tokenizer for the 'ons' corpus that takes care of special metadata tokens.