def _add_custom_spacy_extensions(self):
    """Register this component's getter extensions on Token, Span and Doc.

    Each of ``self.is_attrs_name2func``, ``self.has_attrs_name2func`` and
    ``self.get_attrs_name2func`` is iterated as ``(name, getter)`` pairs.
    A pair is registered only when the target class has no extension under
    that name yet (``get_extension`` returns ``None``); existing extensions
    are left untouched.
    """

    def register_missing(target, name2func):
        # Install each getter on ``target`` unless the name is already taken.
        for attr_name, getter in name2func:
            if target.get_extension(attr_name) is not None:
                continue
            target.set_extension(attr_name, getter=getter, force=True)

    # "is_*" getters live on individual tokens.
    register_missing(Token, self.is_attrs_name2func)

    # "has_*" getters are shared by both container types.
    for container in (Span, Doc):
        register_missing(container, self.has_attrs_name2func)

    # Add attr getters for Span (i.e. Doc.ents).
    register_missing(Span, self.get_attrs_name2func)
def __init__(self, nlp: Language, attr: str = "spaczz_ent", **cfg: Any) -> None:
    """Initialize the spaczz ruler with a Language object and cfg parameters.

    All spaczz ruler cfg parameters are prepended with "spaczz_".
    If spaczz_patterns is supplied here, they need to be
    a list of spaczz patterns: dictionaries with a "label", "pattern",
    "type", and optional "kwargs" key. For example:
    {'label': 'ORG', 'pattern': 'Apple', 'type': 'fuzzy',
    'kwargs': {'min_r2': 90}}.

    Args:
        nlp: The shared nlp object to pass the vocab to the matchers
            (not currently used by spaczz matchers) and process
            fuzzy patterns.
        attr: Name of custom Span attribute that denotes whether an
            entity was added via the spaczz ruler or not. It is registered
            on Span with a default of False if not already registered.
            Default is "spaczz_ent".
        **cfg: Other config parameters. The SpaczzRuler makes heavy use
            of cfg to pass additional parameters down to the matchers.
            spaczz config parameters start with "spaczz_" to keep them
            from colliding with other cfg components.
            SpaczzRuler cfg components include
            (with "spaczz_" prepended to them):
            overwrite_ents (bool): Whether to overwrite existing
                Doc.ents with new matches. Default is False.
            ent_id_sep (str): String to separate entity labels and ids on.
            regex_config (Union[str, RegexConfig]): Config to use with
                the regex matcher. Default is "default".
                See RegexMatcher/RegexSearcher documentation for
                available parameter details.
            fuzzy_defaults (Dict[str, Any]): Modified default parameters
                to use with the fuzzy matcher. Default is an empty
                dictionary - utilizing defaults.
            regex_defaults (Dict[str, Any]): Modified default parameters
                to use with the regex matcher. Default is an empty
                dictionary - utilizing defaults.
                See RegexMatcher/RegexSearcher documentation for
                parameter details.
            patterns (Iterable[Dict[str, Any]]): Patterns to initialize
                the ruler with. Default is None.
            If SpaczzRuler is loaded as part of a model pipeline,
            cfg will include all keyword arguments passed to spacy.load.

    Raises:
        TypeError: If spaczz_{name}_defaults passed are not dictionaries.
    """
    # Register the marker attribute once; a second set_extension call
    # without force would fail if it already exists.
    if not Span.get_extension(attr):
        Span.set_extension(attr, default=False)

    self.nlp = nlp
    self.fuzzy_patterns: DefaultDict[str, DefaultDict[str, Any]] = defaultdict(
        lambda: defaultdict(list)
    )
    self.regex_patterns: DefaultDict[str, DefaultDict[str, Any]] = defaultdict(
        lambda: defaultdict(list)
    )
    self.ent_id_sep = cfg.get("spaczz_ent_id_sep", DEFAULT_ENT_ID_SEP)
    self._ent_ids: Dict[Any, Any] = defaultdict(dict)
    self.overwrite = cfg.get("spaczz_overwrite_ents", False)

    # Collect and validate the per-matcher default overrides.
    default_names = ("spaczz_fuzzy_defaults", "spaczz_regex_defaults")
    self.defaults: Dict[str, Dict[str, Any]] = {}
    for name in default_names:
        if name not in cfg:
            continue
        if not isinstance(cfg[name], dict):
            # Build ONE message string. The fragments were previously passed
            # as a tuple, so the error text rendered as a tuple repr.
            raise TypeError(
                "Defaults must be a dictionary of keyword arguments, "
                f"not {type(cfg[name])}."
            )
        self.defaults[name] = cfg[name]

    self.fuzzy_matcher = FuzzyMatcher(
        nlp.vocab,
        **self.defaults.get("spaczz_fuzzy_defaults", {}),
    )
    self.regex_matcher = RegexMatcher(
        nlp.vocab,
        cfg.get("spaczz_regex_config", "default"),
        **self.defaults.get("spaczz_regex_defaults", {}),
    )

    patterns = cfg.get("spaczz_patterns")
    if patterns is not None:
        self.add_patterns(patterns)