Пример #1
0
def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    """Analyze a sentence containing PII, then anonymize it, end to end.

    Builds an ``AnalyzerEngine`` on a spaCy NLP engine configured with the
    ``en_core_web_sm`` model, checks the detected entities against the
    expected ``RecognizerResult`` list, and then checks that
    ``AnonymizerEngine`` produces the expected text and replaced items.
    """
    text_to_test = "John Smith drivers license is AC432223"

    expected_analyzer_results = [
        RecognizerResult("PERSON", 0, 10, 0.85),
        RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999),
    ]
    # Create configuration containing engine name and models
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    # Create NLP engine based on configuration
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Pass the created NLP engine and supported_languages to the AnalyzerEngine
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine,
        supported_languages=["en"],
    )
    analyzer_results = analyzer.analyze(text_to_test, "en")

    # Compare the counts first: an element-wise loop alone would check nothing
    # when the analyzer returns an empty list, and would fail with an opaque
    # IndexError when it returns more results than expected.
    assert len(analyzer_results) == len(expected_analyzer_results)
    for actual, expected in zip(analyzer_results, expected_analyzer_results):
        assert actual == expected

    expected_anonymized = AnonymizerResult(
        text="<PERSON> drivers license is <US_DRIVER_LICENSE>"
    )
    expected_anonymized.add_item(
        AnonymizedEntity("replace", "US_DRIVER_LICENSE", 28, 47, "<US_DRIVER_LICENSE>")
    )
    expected_anonymized.add_item(
        AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>")
    )

    anonymizer = AnonymizerEngine()
    anonymizer_results = anonymizer.anonymize(text_to_test, analyzer_results)
    assert anonymizer_results == expected_anonymized
Пример #2
0
    def __init__(self, **data: Any):
        """Build the Presidio analyzer and anonymizer used by this instance.

        Fills in a default engine configuration and a default model list when
        they are missing, probes each configured spaCy model (downloading it
        when the probe fails), creates the ``AnalyzerEngine`` for the
        configured languages, registers any extra ``entity_recognizers``, and
        finally creates the ``AnonymizerEngine``.

        Args:
            **data: Keyword arguments forwarded unchanged to the parent
                initializer.
        """
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        # A missing and an empty model list are both falsy, so one check is enough.
        if not self.engine_config.models:
            self.engine_config.models = [PresidioModelConfig()]

        # If spacy engine then load Spacy models and select languages
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if (self.engine_config.nlp_engine_name == "spacy"
                    and model_config.model_name is not None):
                try:
                    spacy_model = __import__(model_config.model_name)
                    spacy_model.load()
                    logger.info(
                        f"Spacy model {model_config.model_name} is already downloaded"
                    )
                except Exception:
                    # Any ordinary failure to import or load the model is
                    # treated as "not installed". KeyboardInterrupt and
                    # SystemExit are deliberately not caught, so interrupting
                    # start-up does not kick off a download.
                    logger.warning(
                        f"Spacy model {model_config.model_name} is not downloaded"
                    )
                    logger.warning(
                        f"Downloading spacy model {model_config.model_name}, it might take some time"
                    )
                    from spacy.cli import download

                    download(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(
            nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                        supported_languages=languages)

        # Register any additional, caller-supplied recognizers.
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with its default settings
        self._anonymizer = AnonymizerEngine()
Пример #3
0
    def __init__(
        self,
        registry: RecognizerRegistry = None,
        nlp_engine: NlpEngine = None,
        app_tracer: AppTracer = None,
        log_decision_process: bool = False,
        default_score_threshold: float = 0,
        supported_languages: List[str] = None,
    ):
        """Store the engine's collaborators, creating defaults for missing ones.

        Args:
            registry: Recognizer registry to use; a new ``RecognizerRegistry``
                is created when a falsy value is given.
            nlp_engine: NLP engine to use; one is created through
                ``NlpEngineProvider`` when a falsy value is given.
            app_tracer: Tracer to use; a new ``AppTracer`` is created when a
                falsy value is given.
            log_decision_process: Stored as-is on the instance.
            default_score_threshold: Stored as-is on the instance.
            supported_languages: Languages handled by this engine; falls back
                to ``["en"]`` when falsy.
        """
        # Fall back to English when no languages were requested.
        supported_languages = supported_languages or ["en"]

        if not nlp_engine:
            logger.info("nlp_engine not provided, creating default.")
            nlp_engine = NlpEngineProvider().create_engine()

        if not registry:
            logger.info("registry not provided, creating default.")
            registry = RecognizerRegistry()

        self.app_tracer = app_tracer if app_tracer else AppTracer()
        self.supported_languages = supported_languages
        self.nlp_engine = nlp_engine
        self.registry = registry

        # A registry without recognizers gets the predefined set for the
        # chosen NLP engine and languages.
        registry_is_empty = not registry.recognizers
        if registry_is_empty:
            registry.load_predefined_recognizers(
                nlp_engine=self.nlp_engine,
                languages=self.supported_languages,
            )

        self.log_decision_process = log_decision_process
        self.default_score_threshold = default_score_threshold
Пример #4
0
def nlp_engine_provider():
    """Return a new ``NlpEngineProvider`` constructed without arguments."""
    provider = NlpEngineProvider()
    return provider