def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    """Run analyze + anonymize end to end on a sentence containing PII.

    An ``AnalyzerEngine`` is built on a spaCy NLP engine configured with the
    ``en_core_web_sm`` model, the sample sentence is analyzed, and the
    analyzer output is then passed to an ``AnonymizerEngine``. Both the
    analyzer results and the anonymized text/items are compared with the
    expected values.
    """
    text_to_test = "John Smith drivers license is AC432223"
    expected_analyzer_results = [
        RecognizerResult("PERSON", 0, 10, 0.85),
        RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999),
    ]

    # Create configuration containing engine name and models
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    # Create NLP engine based on configuration
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Pass the created NLP engine and supported_languages to the AnalyzerEngine
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    analyzer_results = analyzer.analyze(text_to_test, "en")

    # Check the count first: without it, an empty or short result list would
    # skip the per-item comparisons below.
    assert len(analyzer_results) == len(expected_analyzer_results)
    for actual, expected in zip(analyzer_results, expected_analyzer_results):
        assert actual == expected

    expected_anonymizer_result = AnonymizerResult(
        text="<PERSON> drivers license is <US_DRIVER_LICENSE>"
    )
    expected_anonymizer_result.add_item(
        AnonymizedEntity(
            "replace", "US_DRIVER_LICENSE", 28, 47, "<US_DRIVER_LICENSE>"
        )
    )
    expected_anonymizer_result.add_item(
        AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>")
    )

    anonymizer = AnonymizerEngine()
    anonymizer_results = anonymizer.anonymize(text_to_test, analyzer_results)
    assert anonymizer_results == expected_anonymizer_result
def __init__(self, **data: Any):
    """Initialise the instance and build its analyzer and anonymizer engines.

    After the parent initialiser has consumed ``data``:

    * a default ``PresidioEngineConfig`` is installed when ``engine_config``
      is unset, and a single default ``PresidioModelConfig`` when the
      config lists no models;
    * for the ``"spacy"`` engine, each configured model is imported and
      loaded, and is downloaded via ``spacy.cli.download`` if that fails;
    * an ``AnalyzerEngine`` is created for the configured languages and any
      ``entity_recognizers`` are added to its registry;
    * an ``AnonymizerEngine`` is created.

    Args:
        **data: Keyword arguments forwarded unchanged to the parent
            ``__init__``.
    """
    super().__init__(**data)

    if not self.engine_config:
        self.engine_config = PresidioEngineConfig()

    # An empty list is falsy, so this covers both "unset" and "no models".
    if not self.engine_config.models:
        self.engine_config.models = [PresidioModelConfig()]

    # If spacy engine then load Spacy models and select languages
    languages = []
    for model_config in self.engine_config.models:
        languages.append(model_config.lang_code)

        # Check SpacyNlpEngine.engine_name
        if (
            self.engine_config.nlp_engine_name == "spacy"
            and model_config.model_name is not None
        ):
            try:
                # Try to import the model as a module and load it; any
                # failure here is treated as "not downloaded".
                spacy_model = __import__(model_config.model_name)
                spacy_model.load()
            except Exception:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must propagate instead of starting a download.
                logger.warning(
                    f"Spacy model {model_config.model_name} is not downloaded"
                )
                logger.warning(
                    f"Downloading spacy model {model_config.model_name}, it might take some time"
                )
                from spacy.cli import download

                download(model_config.model_name)
            else:
                logger.info(
                    f"Spacy model {model_config.model_name} is already downloaded"
                )

    # Create NLP engine based on configuration
    provider = NlpEngineProvider(nlp_configuration=self.engine_config.dict())
    nlp_engine = provider.create_engine()

    # Pass the created NLP engine and supported_languages to the AnalyzerEngine
    self._analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine, supported_languages=languages
    )

    # Add any user-supplied recognizers to the analyzer's registry.
    if self.entity_recognizers:
        for entity_recognizer in self.entity_recognizers:
            self._analyzer.registry.add_recognizer(entity_recognizer)

    # Initialize the anonymizer with its default settings
    self._anonymizer = AnonymizerEngine()
def __init__(
    self,
    registry: RecognizerRegistry = None,
    nlp_engine: NlpEngine = None,
    app_tracer: AppTracer = None,
    log_decision_process: bool = False,
    default_score_threshold: float = 0,
    supported_languages: List[str] = None,
):
    """Set up the engine, filling in defaults for any missing collaborator.

    Args:
        registry: Recognizer registry to use; a new ``RecognizerRegistry``
            is created when this is falsy.
        nlp_engine: NLP engine to use; when falsy, one is created through a
            default ``NlpEngineProvider``.
        app_tracer: Tracer to use; a new ``AppTracer`` is created when this
            is falsy.
        log_decision_process: Stored as-is on the instance.
        default_score_threshold: Stored as-is on the instance.
        supported_languages: Languages the engine supports; falls back to
            ``["en"]`` when falsy.
    """
    languages = supported_languages or ["en"]

    engine = nlp_engine
    if not engine:
        logger.info("nlp_engine not provided, creating default.")
        engine = NlpEngineProvider().create_engine()

    recognizer_registry = registry
    if not recognizer_registry:
        logger.info("registry not provided, creating default.")
        recognizer_registry = RecognizerRegistry()

    tracer = app_tracer or AppTracer()

    self.app_tracer = tracer
    self.supported_languages = languages
    self.nlp_engine = engine
    self.registry = recognizer_registry

    # A registry with no recognizers gets the predefined set loaded for the
    # configured NLP engine and languages.
    if not recognizer_registry.recognizers:
        recognizer_registry.load_predefined_recognizers(
            nlp_engine=self.nlp_engine,
            languages=self.supported_languages,
        )

    self.log_decision_process = log_decision_process
    self.default_score_threshold = default_score_threshold
def nlp_engine_provider():
    """Return a new ``NlpEngineProvider`` built with no arguments."""
    default_provider = NlpEngineProvider()
    return default_provider