예제 #1
0
    def test_added_pattern_recognizer_works(self):
        """A custom pattern recognizer added at runtime is picked up by the engine."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket_recognizer = PatternRecognizer("ROCKET",
                                              name="Rocket recognizer",
                                              patterns=[rocket_pattern])

        store_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())
        sample_text = "rocket is my favorite transportation"
        requested_entities = ["CREDIT_CARD", "ROCKET"]

        # Without the custom recognizer the entity must not be detected.
        first_pass = engine.analyze(self.unit_test_guid,
                                    text=sample_text,
                                    entities=requested_entities,
                                    language='en',
                                    all_fields=False)
        assert len(first_pass) == 0

        # Register a recognizer for the word "rocket" (case insensitive)
        # and analyze the same text again.
        store_mock.add_custom_pattern_recognizer(rocket_recognizer)
        second_pass = engine.analyze(self.unit_test_guid,
                                     text=sample_text,
                                     entities=requested_entities,
                                     language='en',
                                     all_fields=False)

        assert len(second_pass) == 1
        assert_result(second_pass[0], "ROCKET", 0, 7, 0.8)
예제 #2
0
def test_simple():
    """Smoke test: analyze a short sentence with an explicit entity list."""
    request = dict(
        text="John Smith drivers license is AC432223",
        language="en",
        score_threshold=0.7,
        entities=["CRYPTO", "NRP", "DATE_TIME", "LOCATION", "PERSON"],
    )
    AnalyzerEngine().analyze(**request)
def test_removed_pattern_recognizer_doesnt_work(unit_test_guid):
    """A custom recognizer stops matching once it is removed from the store."""
    spaceship_pattern = Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)
    spaceship_recognizer = PatternRecognizer("SPACESHIP",
                                             name="Spaceship recognizer",
                                             patterns=[spaceship_pattern])

    store_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_mock),
        nlp_engine=NlpEngineMock(),
    )
    sample_text = "spaceship is my favorite transportation"
    requested_entities = ["CREDIT_CARD", "SPACESHIP"]

    def run_analysis():
        # Same analyze call repeated before/after mutating the store.
        return engine.analyze(
            correlation_id=unit_test_guid,
            text=sample_text,
            entities=requested_entities,
            language="en",
            all_fields=False,
        )

    # Before the custom recognizer is added, nothing should match.
    assert len(run_analysis()) == 0

    # Add a recognizer for the word "spaceship" (case insensitive).
    store_mock.add_custom_pattern_recognizer(spaceship_recognizer)
    detected = run_analysis()
    assert len(detected) == 1
    assert_result(detected[0], "SPACESHIP", 0, 10, 0.8)

    # Remove the recognizer and confirm no results come back.
    store_mock.remove_recognizer("Spaceship recognizer")
    assert len(run_analysis()) == 0
예제 #4
0
def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    """End-to-end: spaCy-backed analysis followed by anonymization."""
    text_to_test = "John Smith drivers license is AC432223"

    expected_analysis = [
        RecognizerResult("PERSON", 0, 10, 0.85),
        RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999),
    ]

    # Build an NLP engine from an explicit spaCy configuration.
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }
    nlp_engine = NlpEngineProvider(
        nlp_configuration=nlp_configuration).create_engine()

    # Pass the created NLP engine and supported_languages to the AnalyzerEngine.
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine,
        supported_languages=["en"],
    )
    analyzer_results = analyzer.analyze(text_to_test, "en")
    for i, actual in enumerate(analyzer_results):
        assert actual == expected_analysis[i]

    expected_anonymized = AnonymizerResult(
        text="<PERSON> drivers license is <US_DRIVER_LICENSE>")
    expected_anonymized.add_item(
        AnonymizedEntity("replace", "US_DRIVER_LICENSE", 28, 47,
                         "<US_DRIVER_LICENSE>"))
    expected_anonymized.add_item(
        AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>"))

    anonymizer = AnonymizerEngine()
    assert anonymizer.anonymize(text_to_test,
                                analyzer_results) == expected_anonymized
def test_when_analyze_then_apptracer_has_value(loaded_registry, unit_test_guid,
                                               nlp_engine):
    """With tracing enabled, the app tracer records the analysis run."""
    text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"  # noqa E501
    tracer = AppTracerMock(enable_interpretability=True)
    engine = AnalyzerEngine(
        loaded_registry,
        app_tracer=tracer,
        enable_trace_pii=True,
        nlp_engine=nlp_engine,
    )
    findings = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=["CREDIT_CARD", "PHONE_NUMBER", "PERSON"],
        language="en",
        all_fields=False,
        trace=True,
    )
    assert len(findings) == 3
    for finding in findings:
        assert finding.analysis_explanation is not None
    assert tracer.get_msg_counter() == 2
    assert tracer.get_last_trace() is not None
예제 #6
0
class HansardCleaner:
    """Anonymizes PERSON entities in Hansard XML transcripts using Presidio."""

    def initialize(self):
        """Configure the analyzer/anonymizer engines with a spaCy pipeline."""
        SpacyRecognizer.ENTITIES = ["PERSON"]
        Replace.NEW_VALUE = 'replace_text'
        nlp_engine = SpacyNlpEngine()
        # parser/tagger/lemmatizer are disabled: only NER output is used here.
        nlp_engine.nlp['en'] = spacy.load('en_core_web_lg', disable=["parser", "tagger", "lemmatizer"])

        self.analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer_engine = AnonymizerEngine()

    def run(self, xml_filename):
        """Anonymize every text node of *xml_filename*.

        :param xml_filename: path to the input XML file
        :return: name of the anonymized output file ("out-<input name>")
        """
        etree = ET.parse(xml_filename)

        for elem in etree.iter():
            # elem.text is None for elements without direct text content;
            # guard before stripping (previously raised AttributeError).
            text = (elem.text or '').strip()
            if text:
                results = self.analyzer_engine.analyze(correlation_id=0,
                                                       text=text,
                                                       entities=[],
                                                       language='en',
                                                       score_threshold=0.5)
                if results:
                    elem.text = run_anonymizer(self.anonymizer_engine, text, results)

        new_filename = f'out-{xml_filename}'
        etree.write(new_filename)
        return new_filename
class HansardTextFormatter:
    """Anonymizes PERSON entities and normalizes line breaks in Hansard text."""

    def __init__(self):
        SpacyRecognizer.ENTITIES = ["PERSON"]
        Replace.NEW_VALUE = 'replace_text'
        nlp_engine = SpacyNlpEngine()
        # parser/tagger/lemmatizer are disabled: only NER output is used here.
        nlp_engine.nlp['en'] = spacy.load(
            'en_core_web_lg', disable=["parser", "tagger", "lemmatizer"])

        self.analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer_engine = AnonymizerEngine()

    def run_anonymizer(self, text):
        """Anonymize PERSON entities in *text*; return *text* unchanged if none found."""
        results = self.analyzer_engine.analyze(text=text,
                                               entities=[],
                                               language='en',
                                               score_threshold=0.5)
        if not results:
            # No PII found: return the input unchanged instead of None,
            # so run_formatter can still clean the text (previously this
            # returned None and clean_text crashed on it).
            return text
        config = {
            "PERSON":
            AnonymizerConfig("replace", {"replace_text": "[GDPRREDACT]"})
        }
        return self.anonymizer_engine.anonymize(text, results, config)

    @staticmethod
    def clean_text(text):
        """Drop raw newlines, then turn '<BR />' markers into real newlines."""
        text = text.replace('\n', '')
        text = text.replace('<BR />', '\n')
        return text

    def run_formatter(self, text):
        """Anonymize then clean *text*; returns the formatted string."""
        anon_text = self.run_anonymizer(text)
        cleaned_text = self.clean_text(anon_text)
        return cleaned_text
예제 #8
0
    def test_demo_text(self):
        """Run the full demo text through the engine and verify per-entity counts."""
        text = "Here are a few examples sentences we currently support:\n\n" \
               "Hello, my name is David Johnson and I live in Maine.\n" \
               "My credit card number is 4095-2609-9393-4932 and my " \
               "Crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n\n" \
               "On September 18 I visited microsoft.com and sent an " \
               "email to [email protected],  from the IP 192.168.0.1.\n\n" \
               "My passport: 991280345 and my phone number: (212) 555-1234.\n\n" \
               "Please transfer using this IBAN IL150120690000003111111.\n\n" \
               "Can you please check the status on bank account 954567876544 " \
               "in PresidiBank?\n\n" \
               "" \
               "Kate's social security number is 078-05-1120.  " \
               "Her driver license? it is 9234567B.\n\n" \
               "" \
               "This project welcomes contributions and suggestions.\n" \
               "Most contributions require you to agree to a " \
               "Contributor License Agreement (CLA) declaring " \
               "that you have the right to, and actually do, " \
               "grant us the rights to use your contribution. " \
               "For details, visit https://cla.microsoft.com " \
               "When you submit a pull request, " \
               "a CLA-bot will automatically determine whether " \
               "you need to provide a CLA and decorate the PR " \
               "appropriately (e.g., label, comment).\n" \
               "Simply follow the instructions provided by the bot. " \
               "You will only need to do this once across all repos using our CLA.\n" \
               "This project has adopted the Microsoft Open Source Code of Conduct.\n" \
               "For more information see the Code of Conduct FAQ or " \
               "contact [email protected] with any additional questions or comments."

        engine = AnalyzerEngine(default_score_threshold=0.35,
                                nlp_engine=loaded_spacy_nlp_engine)
        results = engine.analyze(correlation_id=self.unit_test_guid,
                                 text=text,
                                 entities=None,
                                 language="en",
                                 all_fields=True)
        for result in results:
            logger.info(
                "Entity = {}, Text = {}, Score={}, Start={}, End={}".format(
                    result.entity_type, text[result.start:result.end],
                    result.score, result.start, result.end))
        detected_entities = [result.entity_type for result in results]

        # Expected number of detections per entity type in the demo text.
        expected_counts = {
            "CREDIT_CARD": 1, "CRYPTO": 1, "DATE_TIME": 1, "DOMAIN_NAME": 4,
            "EMAIL_ADDRESS": 2, "IBAN_CODE": 1, "IP_ADDRESS": 1,
            "LOCATION": 1, "PERSON": 2, "PHONE_NUMBER": 1,
            "US_BANK_NUMBER": 1, "US_DRIVER_LICENSE": 1,
            "US_PASSPORT": 1, "US_SSN": 1,
        }
        for entity_type, expected in expected_counts.items():
            assert detected_entities.count(entity_type) == expected

        assert len(results) == 19
예제 #9
0
def test_when_analyze_two_entities_embedded_then_return_results(nlp_engine):
    """Overlapping entities of different types are both reported."""
    engine = AnalyzerEngine(nlp_engine=nlp_engine)

    # A person's name with a driver-license number embedded in it.
    findings = engine.analyze(text="My name is John 1234567 Doe",
                              language="en",
                              score_threshold=0)

    # Duplicates are only removed when both results share an entity type,
    # so the PERSON and license findings both survive.
    assert len(findings) == 2
예제 #10
0
def test_ad_hoc_with_context_support_higher_confidence(nlp_engine,
                                                       zip_code_recognizer):
    """Adding context words to an ad-hoc recognizer raises its score."""
    text = "Mr. John Smith's zip code is 10023"
    engine = AnalyzerEngine(nlp_engine=nlp_engine)

    # First pass: recognizer has no context words configured.
    no_context = engine.analyze(
        text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer])

    # Second pass: the same recognizer now carries context words.
    zip_code_recognizer.context = ["zip", "code"]
    with_context = engine.analyze(
        text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer])

    zips_without = [r for r in no_context if r.entity_type == "ZIP"]
    zips_with = [r for r in with_context if r.entity_type == "ZIP"]

    assert zips_without[0].score < zips_with[0].score
예제 #11
0
class Presidio:
    """Thin facade bundling Presidio analysis and redaction."""

    def __init__(self):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def analyze_and_anonymize(self, text) -> str:
        """Detect PII in *text* and redact every finding."""
        findings = self.analyzer.analyze(text=text, language='en')
        redacted = self.anonymizer.anonymize(
            text=text,
            analyzer_results=findings,
            operators={"DEFAULT": OperatorConfig("redact")})
        return redacted.text
def anonymize_text(text: str) -> str:
    """Replace every detected PII span in *text* with "<ANONYMIZED>".

    :param text: input text to scan for PII
    :return: the anonymized text
    """
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()
    analyzer_results = analyzer.analyze(text=text, language="en")
    anonymized_results = anonymizer.anonymize(
        text=text,
        analyzer_results=analyzer_results,
        anonymizers_config={
            "DEFAULT": AnonymizerConfig("replace",
                                        {"new_value": "<ANONYMIZED>"})
        },
    )
    # Return the anonymized string, not the engine result object,
    # matching the declared -> str return type.
    return anonymized_results.text
예제 #13
0
def test_when_analyze_added_pattern_recognizer_then_succeed(unit_test_guid):
    """A recognizer added to the registry is used on subsequent analyze calls."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)])

    registry_mock = RecognizerRegistryMock()
    engine = AnalyzerEngine(
        registry=registry_mock,
        nlp_engine=NlpEngineMock(),
    )

    def run_analysis():
        # Same analyze call repeated before/after mutating the registry.
        return engine.analyze(
            correlation_id=unit_test_guid,
            text="rocket is my favorite transportation",
            entities=["CREDIT_CARD", "ROCKET"],
            language="en",
        )

    # Without the custom recognizer, nothing should be found.
    assert len(run_analysis()) == 0

    # Add a recognizer for the word "rocket" (case insensitive) and re-run.
    registry_mock.add_recognizer(rocket_recognizer)
    findings = run_analysis()

    assert len(findings) == 1
    assert_result(findings[0], "ROCKET", 0, 7, 0.8)
예제 #14
0
def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields(
    nlp_engine, ):
    """Omitting the entities argument lets every loaded recognizer report."""
    engine = AnalyzerEngine(registry=RecognizerRegistry(),
                            nlp_engine=nlp_engine)
    text = "My name is Sharon and I live in Seattle." "Domain: microsoft.com "
    response = engine.analyze(text=text,
                              score_threshold=0,
                              language="en")

    assert response is not None
    found_types = {result.entity_type for result in response}
    assert "PERSON" in found_types
    assert "LOCATION" in found_types
    assert "DOMAIN_NAME" in found_types
예제 #15
0
def test_when_entities_is_none_then_return_all_fields(loaded_registry):
    """Omitting the entities argument lets every recognizer report."""
    engine = AnalyzerEngine(registry=loaded_registry,
                            nlp_engine=NlpEngineMock())
    text = (" Credit card: 4095-2609-9393-4932,  my phone is 425 8829090 "
            "Domain: microsoft.com")
    response = engine.analyze(text=text,
                              score_threshold=0,
                              language="en")

    assert response is not None
    found_types = {result.entity_type for result in response}
    assert "CREDIT_CARD" in found_types
    assert "PHONE_NUMBER" in found_types
    assert "DOMAIN_NAME" in found_types
예제 #16
0
    def analyze(self, image: object, **kwargs) -> List[ImageRecognizerResult]:
        """Analyse method to analyse the given image.

        :param image: PIL Image/numpy array or file path(str) to be processed
        :param kwargs: forwarded verbatim to AnalyzerEngine.analyze

        :return: list of the extract entities with image bounding boxes
        """
        # Use a single OCR instance for both steps (was constructed twice).
        ocr = OCR()
        ocr_result = ocr.perform_ocr(image)
        text = ocr.get_text_from_ocr_dict(ocr_result)

        analyzer = AnalyzerEngine()
        analyzer_result = analyzer.analyze(text=text, language="en", **kwargs)
        bboxes = self.map_analyzer_results_to_bounding_boxes(
            analyzer_result, ocr_result, text)
        return bboxes
예제 #17
0
    def test_when_default_threshold_is_zero_all_results_pass(self):
        """With no explicit threshold, low-scoring results are not filtered."""
        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can detect the phone number entity.
        engine = AnalyzerEngine(
            registry=self.loaded_registry, nlp_engine=MockNlpEngine())
        findings = engine.analyze(
            self.unit_test_guid,
            " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090",
            ["CREDIT_CARD", "PHONE_NUMBER"],
            "en",
            all_fields=False)

        assert len(findings) == 2
예제 #18
0
def anonymize_text(text: str) -> str:
    """Replace every detected PII span in *text* with "<ANONYMIZED>".

    On any failure the error is printed and None is returned (best effort).
    """
    try:
        pii_analyzer = AnalyzerEngine()
        pii_anonymizer = AnonymizerEngine()
        findings = pii_analyzer.analyze(text=text, language="en")
        result = pii_anonymizer.anonymize(
            text=text,
            analyzer_results=findings,
            operators={
                "DEFAULT":
                AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"})
            },
        )
        return result.text
    except Exception as e:
        print(f"An exception occurred. {e}")
예제 #19
0
class PDM:
    """Wrapper that times Presidio analysis and anonymization."""

    def __init__(self, language='en'):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.language = language

    def predict(self, text, entities_of_interest=ENTITIES_OF_INTEREST):
        """Detect and anonymize PII in *text*; return timings and findings."""
        analysis_start = time()
        findings = self.analyzer.analyze(
            text, entities=entities_of_interest, language=self.language)
        anonymize_start = time()
        anonymized = self.anonymizer.anonymize(
            text=text, analyzer_results=findings)
        finished = time()
        detected = [{'start': item.start,
                     'end': item.end,
                     'entity_type': item.entity_type}
                    for item in anonymized.items]
        return {'time_to_analyze': f'{anonymize_start-analysis_start:.4f} seconds',
                'time_to_anonymize': f'{finished-anonymize_start:.4f} seconds',
                'anonymized_text': anonymized.text,
                'detected_items': detected}
예제 #20
0
 def test_when_analyze_then_apptracer_has_value(self):
     """Tracing-enabled analysis populates the shared app tracer."""
     text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
     engine = AnalyzerEngine(self.loaded_registry,
                             app_tracer=self.app_tracer,
                             enable_trace_pii=True,
                             nlp_engine=TESTS_NLP_ENGINE)
     findings = engine.analyze(correlation_id=self.unit_test_guid,
                               text=text,
                               entities=["CREDIT_CARD", "PHONE_NUMBER",
                                         "PERSON"],
                               language="en",
                               all_fields=False,
                               trace=True)
     assert len(findings) == 3
     for finding in findings:
         assert finding.analysis_explanation is not None
     assert self.app_tracer.get_msg_counter() == 2
     assert self.app_tracer.get_last_trace() is not None
예제 #21
0
def test_when_threshold_is_more_than_half_then_only_credit_card_passes(
        loaded_registry, unit_test_guid):
    """A 0.51 threshold filters the phone number but keeps the credit card."""
    # This analyzer engine is different from the global one, as this one
    # also loads SpaCy so it can detect the phone number entity.
    engine = AnalyzerEngine(registry=loaded_registry,
                            nlp_engine=NlpEngineMock())
    findings = engine.analyze(
        correlation_id=unit_test_guid,
        text=" Credit card: 4095-2609-9393-4932,  my phone is 425 8829090",
        entities=["CREDIT_CARD", "PHONE_NUMBER"],
        language="en",
        score_threshold=0.51,
    )

    assert len(findings) == 1
예제 #22
0
def test_when_add_recognizer_then_also_outputs_others(nlp_engine):
    """A custom recognizer coexists with the predefined ones."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
        supported_language="en",
    )
    registry = RecognizerRegistry()
    registry.add_recognizer(rocket_recognizer)
    registry.load_predefined_recognizers()

    # The registry now holds the custom recognizer plus the predefined set.
    assert len(registry.recognizers) > 1

    engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)

    # Expect both PERSON (predefined) and ROCKET (custom) to be found.
    findings = engine.analyze(text="Michael Jones has a rocket", language="en")
    assert len(findings) == 2
예제 #23
0
    def test_analyze_with_multiple_predefined_recognizers(self):
        """Credit card and context-boosted phone number are both detected."""
        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can use the context words.
        engine = AnalyzerEngine(
            registry=self.loaded_registry, nlp_engine=loaded_spacy_nlp_engine)
        findings = engine.analyze(
            self.unit_test_guid,
            " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090",
            ["CREDIT_CARD", "PHONE_NUMBER"],
            "en",
            all_fields=False)

        assert len(findings) == 2
        assert_result(findings[0], "CREDIT_CARD", 14,
                      33, EntityRecognizer.MAX_SCORE)
        # 0.5 (medium regex) + 0.35 (context similarity) = 0.85
        phone_score = (UsPhoneRecognizer.MEDIUM_REGEX_SCORE
                       + PatternRecognizer.CONTEXT_SIMILARITY_FACTOR)
        assert_result(findings[1], "PHONE_NUMBER", 48, 59, phone_score)
예제 #24
0
def test_when_analyze_with_multiple_predefined_recognizers_then_succeed(
        loaded_registry, unit_test_guid, nlp_engine, max_score):
    """Both the credit card and the context-boosted phone number are found."""
    engine = AnalyzerEngine(registry=loaded_registry,
                            nlp_engine=nlp_engine)
    findings = engine.analyze(
        correlation_id=unit_test_guid,
        text=" Credit card: 4095-2609-9393-4932,  my phone is 425 8829090",
        entities=["CREDIT_CARD", "PHONE_NUMBER"],
        language="en",
    )

    assert len(findings) == 2
    assert_result(findings[0], "CREDIT_CARD", 14, 33, max_score)
    # 0.5 medium-regex score (see UsPhoneRecognizer.PATTERNS) plus the
    # 0.35 PatternRecognizer.CONTEXT_SIMILARITY_FACTOR context boost.
    expected_phone_score = 0.5 + 0.35
    assert_result(findings[1], "PHONE_NUMBER", 48, 59, expected_phone_score)
예제 #25
0
class PresidioPIIAnalyzer(BaseAnalyzer):
    """PII analyzer/anonymizer backed by Microsoft Presidio."""

    _analyzer: AnalyzerEngine = PrivateAttr()
    _anonymizer: AnonymizerEngine = PrivateAttr()
    TYPE: str = "PresidioPII"
    engine_config: Optional[PresidioEngineConfig] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # To add customer recognizers refer https://microsoft.github.io/presidio/analyzer/adding_recognizers/
    entity_recognizers: Optional[List[EntityRecognizer]] = None
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, OperatorConfig]] = None

    def __init__(self, **data: Any):
        """Build the Presidio engines, downloading spaCy models if needed."""
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        # Collect configured languages and, for spaCy engines, make sure
        # every configured model is available locally.
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if self.engine_config.nlp_engine_name == "spacy":
                self._ensure_spacy_model(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(
            nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                        supported_languages=languages)

        # Register any user-supplied custom recognizers.
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with logger
        self._anonymizer = AnonymizerEngine()

    @staticmethod
    def _ensure_spacy_model(model_name: str) -> None:
        """Import the spaCy model *model_name*, downloading it if missing."""
        try:
            spacy_model = __import__(model_name)
            spacy_model.load()
            logger.info(
                f"Spacy model {model_name} is already downloaded"
            )
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        except Exception:
            logger.warning(
                f"Spacy model {model_name} is not downloaded"
            )
            logger.warning(
                f"Downloading spacy model {model_name}, it might take some time"
            )
            from spacy.cli import download

            download(model_name)

    def analyze_input(
        self,
        source_response_list: List[AnalyzerRequest],
        analyzer_config: PresidioPIIAnalyzerConfig,
        language: Optional[str] = "en",
        **kwargs,
    ) -> List[AnalyzerResponse]:
        """Analyze (and optionally anonymize) each source response.

        :param source_response_list: inputs whose processed_text is scanned
        :param analyzer_config: controls entities, decision-process output,
            anonymization, and whether original text is replaced
        :param language: language code passed to the Presidio analyzer
        :return: one AnalyzerResponse per input, in order
        """
        analyzer_output: List[AnalyzerResponse] = []

        for source_response in source_response_list:
            analyzer_result = self._analyzer.analyze(
                text=source_response.processed_text,
                entities=analyzer_config.entities,
                return_decision_process=(
                    analyzer_config.return_decision_process),
                language=language,
            )

            anonymized_result = None
            if not analyzer_config.analyze_only:
                # Per-request config wins over the instance-level default.
                anonymizers_config = (analyzer_config.anonymizers_config
                                      or self.anonymizers_config)

                if (source_response.processed_text is not None
                        and len(source_response.processed_text) > 0):
                    anonymized_result = self._anonymizer.anonymize(
                        text=source_response.processed_text,
                        operators=anonymizers_config,
                        analyzer_results=analyzer_result,
                    )

            # Optionally substitute the anonymized text for the original.
            if analyzer_config.replace_original_text and anonymized_result is not None:
                text = anonymized_result.text
            else:
                text = source_response.processed_text

            analyzer_output.append(
                AnalyzerResponse(
                    processed_text=text,
                    meta=source_response.meta,
                    segmented_data={
                        "analyzer_result":
                        [vars(result) for result in analyzer_result],
                        "anonymized_result":
                        None if not anonymized_result else
                        [vars(item) for item in anonymized_result.items],
                        "anonymized_text":
                        None
                        if not anonymized_result else anonymized_result.text,
                    },
                    source_name=source_response.source_name,
                ))

        return analyzer_output
예제 #26
0
class TestAnalyzerEngine(TestCase):
    """Unit tests for AnalyzerEngine.

    Covers entity analysis, score thresholds, duplicate-result removal,
    dynamically added/removed custom pattern recognizers, and the
    Apply/GetAllRecognizers request APIs.  Mock registries and mock NLP
    engines are used so most tests run without a real SpaCy model.
    """

    def __init__(self, *args, **kwargs):
        # Build one shared engine backed by a mock registry and a mock NLP
        # engine; individual tests create their own engines when they need
        # different recognizers or a real NLP pipeline.
        super(TestAnalyzerEngine, self).__init__(*args, **kwargs)
        self.loaded_registry = MockRecognizerRegistry(RecognizerStoreApiMock())
        mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en")
        self.app_tracer = AppTracerMock(enable_interpretability=True)
        self.loaded_analyzer_engine = AnalyzerEngine(
            self.loaded_registry,
            MockNlpEngine(stopwords=[],
                          punct_words=[],
                          nlp_artifacts=mock_nlp_artifacts),
            app_tracer=self.app_tracer,
            enable_trace_pii=True)
        # Fixed correlation id passed as the first positional argument of
        # analyze() throughout these tests.
        self.unit_test_guid = "00000000-0000-0000-0000-000000000000"

    def test_analyze_with_predefined_recognizers_return_results(self):
        """Requesting a single entity (CREDIT_CARD) yields exactly one result."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD"]
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      text,
                                                      entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)

    def test_analyze_with_multiple_predefined_recognizers(self):
        """Two requested entities return two results; phone score is context-boosted."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one, as this one
        # also loads SpaCy so it can use the context words

        analyzer_engine_with_spacy = AnalyzerEngine(
            registry=self.loaded_registry, nlp_engine=loaded_spacy_nlp_engine)
        results = analyzer_engine_with_spacy.analyze(self.unit_test_guid,
                                                     text,
                                                     entities,
                                                     language,
                                                     all_fields=False)

        assert len(results) == 2
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)
        expected_score = UsPhoneRecognizer.MEDIUM_REGEX_SCORE + \
                         PatternRecognizer.CONTEXT_SIMILARITY_FACTOR  # 0.5 + 0.35 = 0.85
        assert_result(results[1], "PHONE_NUMBER", 48, 59, expected_score)

    def test_analyze_without_entities(self):
        """An empty entities list with all_fields=False raises ValueError."""
        with pytest.raises(ValueError):
            language = "en"
            text = " Credit card: 4095-2609-9393-4932,  my name is  John Oliver, DateTime: September 18 Domain: microsoft.com"
            entities = []
            self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                text,
                                                entities,
                                                language,
                                                all_fields=False)

    def test_analyze_with_empty_text(self):
        """Empty input text produces no results (and no error)."""
        language = "en"
        text = ""
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      text,
                                                      entities,
                                                      language,
                                                      all_fields=False)

        assert len(results) == 0

    def test_analyze_with_unsupported_language(self):
        """A language with no registered recognizers ('de') raises ValueError."""
        with pytest.raises(ValueError):
            language = "de"
            text = ""
            entities = ["CREDIT_CARD", "PHONE_NUMBER"]
            self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                text,
                                                entities,
                                                language,
                                                all_fields=False)

    def test_remove_duplicates(self):
        """Duplicate span + entity: only the highest-scoring result is kept."""
        # test same result with different score will return only the highest
        arr = [
            RecognizerResult(start=0,
                             end=5,
                             score=0.1,
                             entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0,
                             end=5,
                             score=0.5,
                             entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None))
        ]
        # Name-mangled access to the private __remove_duplicates helper.
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)
        assert len(results) == 1
        assert results[0].score == 0.5
        # TODO: add more cases with bug:
        # bug# 597: Analyzer remove duplicates doesn't handle all cases of one result as a substring of the other

    def test_remove_duplicates_different_entity_no_removal(self):
        # Same span but DIFFERENT entity types: both results must be kept.
        arr = [
            RecognizerResult(start=0,
                             end=5,
                             score=0.1,
                             entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0,
                             end=5,
                             score=0.5,
                             entity_type="y",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None))
        ]
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)
        assert len(results) == 2

    def test_added_pattern_recognizer_works(self):
        """A custom recognizer added to the store is picked up by the engine."""
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]

        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)

    def test_removed_pattern_recognizer_doesnt_work(self):
        """A custom recognizer removed from the store stops producing results."""
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "spaceship is my favorite transportation"
        entities = ["CREDIT_CARD", "SPACESHIP"]

        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "spaceship" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")
        # Test again to see we didn't get any results
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

    def test_apply_with_language_returns_correct_response(self):
        """Apply() with an explicit language returns analyze results."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = 'en'
        request.analyzeTemplate.resultsScoreThreshold = 0
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)

        assert response.analyzeResults is not None

    def test_apply_with_no_language_returns_default(self):
        """Apply() with an empty language falls back to the default language."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = ''
        request.analyzeTemplate.resultsScoreThreshold = 0
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"
        response = self.loaded_analyzer_engine.Apply(request, None)
        assert response.analyzeResults is not None

    def test_when_allFields_is_true_return_all_fields(self):
        """allFields=True returns every entity the mock registry detects."""
        analyze_engine = AnalyzerEngine(registry=MockRecognizerRegistry(),
                                        nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.analyzeTemplate.resultsScoreThreshold = 0
        request.text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090 " \
                       "Domain: microsoft.com"
        response = analyze_engine.Apply(request, None)
        returned_entities = [
            field.field.name for field in response.analyzeResults
        ]

        assert response.analyzeResults is not None
        assert "CREDIT_CARD" in returned_entities
        assert "PHONE_NUMBER" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
            self):
        """allFields=True with the full registry + SpaCy detects NLP entities too."""
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=loaded_spacy_nlp_engine)
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        response = analyze_engine.Apply(request, None)
        returned_entities = [
            field.field.name for field in response.analyzeResults
        ]
        assert response.analyzeResults is not None
        assert "PERSON" in returned_entities
        assert "LOCATION" in returned_entities
        assert "DOMAIN_NAME" in returned_entities

    def test_when_allFields_is_true_and_entities_not_empty_exception(self):
        """allFields=True combined with explicit fields is rejected."""
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        request.analyzeTemplate.allFields = True
        new_field = request.analyzeTemplate.fields.add()
        new_field.name = 'CREDIT_CARD'
        new_field.minScore = '0.5'
        with pytest.raises(ValueError):
            analyze_engine.Apply(request, None)

    def test_when_analyze_then_apptracer_has_value(self):
        """With trace=True the app tracer records interpretability messages."""
        text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER", "PERSON"]
        analyzer_engine_with_spacy = AnalyzerEngine(
            self.loaded_registry,
            app_tracer=self.app_tracer,
            enable_trace_pii=True,
            nlp_engine=TESTS_NLP_ENGINE)
        results = analyzer_engine_with_spacy.analyze(
            correlation_id=self.unit_test_guid,
            text=text,
            entities=entities,
            language=language,
            all_fields=False,
            trace=True)
        assert len(results) == 3
        for result in results:
            assert result.analysis_explanation is not None
        assert self.app_tracer.get_msg_counter() == 2
        assert self.app_tracer.get_last_trace() is not None

    def test_when_threshold_is_zero_all_results_pass(self):
        """score_threshold=0 lets every detected result through."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one.
        # NOTE(review): the original comment claimed SpaCy is loaded here,
        # but this instance actually uses MockNlpEngine — confirm intent.

        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid,
                                          text,
                                          entities,
                                          language,
                                          all_fields=False,
                                          score_threshold=0)

        assert len(results) == 2

    def test_when_threshold_is_more_than_half_only_credit_card_passes(self):
        """score_threshold=0.51 filters out the lower-scoring phone result."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one.
        # NOTE(review): the original comment claimed SpaCy is loaded here,
        # but this instance actually uses MockNlpEngine — confirm intent.

        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid,
                                          text,
                                          entities,
                                          language,
                                          all_fields=False,
                                          score_threshold=0.51)

        assert len(results) == 1

    def test_when_default_threshold_is_more_than_half_only_one_passes(self):
        """An engine-level default_score_threshold filters low-scoring results."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one.
        # NOTE(review): the original comment claimed SpaCy is loaded here,
        # but this instance actually uses MockNlpEngine — confirm intent.

        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine(),
                                         default_score_threshold=0.7)
        results = analyzer_engine.analyze(self.unit_test_guid,
                                          text,
                                          entities,
                                          language,
                                          all_fields=False)

        assert len(results) == 1

    def test_when_default_threshold_is_zero_all_results_pass(self):
        """With the default (zero) threshold, all detected results pass."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # This analyzer engine is different from the global one.
        # NOTE(review): the original comment claimed SpaCy is loaded here,
        # but this instance actually uses MockNlpEngine — confirm intent.

        analyzer_engine = AnalyzerEngine(registry=self.loaded_registry,
                                         nlp_engine=MockNlpEngine())
        results = analyzer_engine.analyze(self.unit_test_guid,
                                          text,
                                          entities,
                                          language,
                                          all_fields=False)

        assert len(results) == 2

    def test_demo_text(self):
        """End-to-end run over the demo text: per-entity counts and total."""
        text = "Here are a few examples sentences we currently support:\n\n" \
               "Hello, my name is David Johnson and I live in Maine.\n" \
               "My credit card number is 4095-2609-9393-4932 and my " \
               "Crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n\n" \
               "On September 18 I visited microsoft.com and sent an " \
               "email to [email protected],  from the IP 192.168.0.1.\n\n" \
               "My passport: 991280345 and my phone number: (212) 555-1234.\n\n" \
               "Please transfer using this IBAN IL150120690000003111111.\n\n" \
               "Can you please check the status on bank account 954567876544 " \
               "in PresidiBank?\n\n" \
               "" \
               "Kate's social security number is 078-05-1120.  " \
               "Her driver license? it is 9234567B.\n\n" \
               "" \
               "This project welcomes contributions and suggestions.\n" \
               "Most contributions require you to agree to a " \
               "Contributor License Agreement (CLA) declaring " \
               "that you have the right to, and actually do, " \
               "grant us the rights to use your contribution. " \
               "For details, visit https://cla.microsoft.com " \
               "When you submit a pull request, " \
               "a CLA-bot will automatically determine whether " \
               "you need to provide a CLA and decorate the PR " \
               "appropriately (e.g., label, comment).\n" \
               "Simply follow the instructions provided by the bot. " \
               "You will only need to do this once across all repos using our CLA.\n" \
               "This project has adopted the Microsoft Open Source Code of Conduct.\n" \
               "For more information see the Code of Conduct FAQ or " \
               "contact [email protected] with any additional questions or comments."

        language = "en"

        analyzer_engine = AnalyzerEngine(default_score_threshold=0.35,
                                         nlp_engine=loaded_spacy_nlp_engine)
        results = analyzer_engine.analyze(correlation_id=self.unit_test_guid,
                                          text=text,
                                          entities=None,
                                          language=language,
                                          all_fields=True)
        for result in results:
            logger.info(
                "Entity = {}, Text = {}, Score={}, Start={}, End={}".format(
                    result.entity_type, text[result.start:result.end],
                    result.score, result.start, result.end))
        detected_entities = [result.entity_type for result in results]

        assert len([
            entity for entity in detected_entities if entity == "CREDIT_CARD"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "CRYPTO"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "DATE_TIME"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "DOMAIN_NAME"
        ]) == 4
        assert len([
            entity for entity in detected_entities if entity == "EMAIL_ADDRESS"
        ]) == 2
        assert len([
            entity for entity in detected_entities if entity == "IBAN_CODE"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "IP_ADDRESS"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "LOCATION"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "PERSON"
        ]) == 2
        assert len([
            entity for entity in detected_entities if entity == "PHONE_NUMBER"
        ]) == 1
        assert len([
            entity for entity in detected_entities
            if entity == "US_BANK_NUMBER"
        ]) == 1
        assert len([
            entity for entity in detected_entities
            if entity == "US_DRIVER_LICENSE"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "US_PASSPORT"
        ]) == 1
        assert len([
            entity for entity in detected_entities if entity == "US_SSN"
        ]) == 1

        assert len(results) == 19

    def test_get_recognizers_returns_predefined(self):
        """GetAllRecognizers returns the 15 predefined recognizers for 'en'."""
        analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                        nlp_engine=loaded_spacy_nlp_engine)
        request = RecognizersAllRequest(language="en")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers that detect the 17 entities
        assert len(response) == 15

    def test_get_recognizers_returns_custom(self):
        """A custom recognizer in the store appears in GetAllRecognizers."""
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="en")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers and one custom
        assert len(response) == 16
        rocket_recognizer = [
            recognizer for recognizer in response
            if recognizer.name == "Rocket recognizer" and recognizer.entities
            == ["ROCKET"] and recognizer.language == "en"
        ]
        assert len(rocket_recognizer) == 1

    def test_get_recognizers_returns_added_custom(self):
        """A recognizer added after engine creation shows up on the next call."""
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        recognizers_store_api_mock = RecognizerStoreApiMock()

        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="en")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers
        assert len(response) == 15
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        response = analyze_engine.GetAllRecognizers(request, None)
        # there are 15 predefined recognizers and one custom
        assert len(response) == 16

    def test_get_recognizers_returns_supported_language(self):
        """GetAllRecognizers filters by language: only the 'ru' recognizer returns."""
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer RU",
                                               patterns=[pattern],
                                               supported_language="ru")

        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="ru")
        response = analyze_engine.GetAllRecognizers(request, None)
        # there is only 1 mocked russian recognizer
        assert len(response) == 1
def anonymize_reverse_lambda(analyzer_results, text_to_anonymize):
    """Anonymize detected EMAIL_ADDRESS entities by reversing their text."""
    reverse_operator = OperatorConfig("custom", {"lambda": lambda value: value[::-1]})
    return anonymizer.anonymize(
        text=text_to_anonymize,
        analyzer_results=analyzer_results,
        operators={"EMAIL_ADDRESS": reverse_operator},
    )

def anonymize_faker_lambda(analyzer_results, text_to_anonymize):
    """Anonymize detected EMAIL_ADDRESS entities with a Faker-generated safe email."""
    faker_operator = OperatorConfig("custom", {"lambda": lambda _value: fake.safe_email()})
    return anonymizer.anonymize(
        text=text_to_anonymize,
        analyzer_results=analyzer_results,
        operators={"EMAIL_ADDRESS": faker_operator},
    )

# Demo driver: set up Faker (for replacement emails) plus the Presidio
# analyzer/anonymizer engines, detect EMAIL_ADDRESS entities, then show
# both custom-lambda anonymization strategies.
fake = Faker('en_US')
fake.add_provider(internet)

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

text = 'The user has the following two emails: [email protected] and [email protected]'
analyzer_results = analyzer.analyze(text=text, entities=["EMAIL_ADDRESS"], language='en')
# Fixed typo in the output label: "Origina" -> "Original".
print("Original Text: ", text)
print("Analyzer result:", analyzer_results, '\n')

print("Reverse lambda result: ", anonymize_reverse_lambda(analyzer_results, text).text, '\n')
print("Faker lambda result: ", anonymize_faker_lambda(analyzer_results, text).text, '\n')
# Example #28
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities.engine import OperatorConfig

text_to_anonymize = "His name is Tom and his phone number is 212-555-5555"

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Step 1: detect PII entities in the sample text (English).
analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')
print("\nPII Detection:")
print(analyzer_results)

# Step 2: replace every detected entity with a fixed placeholder via the
# DEFAULT operator, then print the anonymization outcome as JSON.
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})})
print("\nPII Anonymization:")
print(anonymized_results.to_json())
# Example #29
# Show the document's core metadata, extract its full text, and run the
# Presidio analyzer over it.
# NOTE(review): `doc` (file name) and `document` (python-docx Document) are
# assumed to be defined earlier in the example — confirm against the caller.
print("\nDocument properties for", doc)
print("Author:", document.core_properties.author)
print("Last Modified By:", document.core_properties.last_modified_by)
print("Date:", document.core_properties.modified)
print("\n===============================================")

paras = document.paragraphs

# str.join runs in linear time, unlike repeated `doctext += ...` which is
# quadratic in the number of paragraphs.
doctext = "".join(para.text for para in paras)

print("\n===============================================")
print("Extracted text from", doc)
print(doctext)
print("\n===============================================")

engine = AnalyzerEngine()

# all_fields=True requests every supported entity type; results below 0.5
# confidence are dropped.
response = engine.analyze(correlation_id=0,
                          text=doctext,
                          entities=[],
                          language='en',
                          all_fields=True,
                          score_threshold=0.5)

for item in response:
    print("Start = {}, end = {}, entity = {}, confidence = {}".format(
        item.start, item.end, item.entity_type, item.score))
# Example #30
def analyze(text):
    """Run the Presidio analyzer over *text* (English) and return its results."""
    engine = AnalyzerEngine()
    return engine.analyze(text=text, language='en')