Пример #1
0
def test_get_recognizers_returns_added_custom():
    """GetAllRecognizers should include a custom recognizer once it is added."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])

    store_api_mock = RecognizerStoreApiMock()

    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_api_mock),
        nlp_engine=NlpEngineMock(),
    )
    all_request = RecognizersAllRequest(language="en")

    # there are 14 predefined recognizers
    assert len(engine.GetAllRecognizers(all_request, None)) == 14

    store_api_mock.add_custom_pattern_recognizer(rocket_recognizer)

    # there are 14 predefined recognizers and one custom
    assert len(engine.GetAllRecognizers(all_request, None)) == 15
Пример #2
0
def test_when_analyze_with_multiple_predefined_recognizers_then_succeed(
        loaded_registry, unit_test_guid, nlp_engine, max_score):
    """Both the credit-card and the phone-number recognizers should fire."""
    text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"

    engine = AnalyzerEngine(registry=loaded_registry,
                            nlp_engine=nlp_engine)
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=["CREDIT_CARD", "PHONE_NUMBER"],
        language="en",
    )

    assert len(results) == 2
    assert_result(results[0], "CREDIT_CARD", 14, 33, max_score)
    # Score constants mirror UsPhoneRecognizer.PATTERNS (0.5, medium regex)
    # and PatternRecognizer.CONTEXT_SIMILARITY_FACTOR (0.35).
    medium_regex_score = 0.5
    context_similarity_factor = 0.35
    assert_result(results[1], "PHONE_NUMBER", 48, 59,
                  medium_regex_score + context_similarity_factor)
def test_when_threshold_is_zero_all_results_pass(loaded_registry,
                                                 unit_test_guid):
    """With a score threshold of zero every detection is returned."""
    text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"

    # This analyzer engine is different from the global one, as this one
    # also loads SpaCy so it can detect the phone number entity
    engine = AnalyzerEngine(registry=loaded_registry,
                            nlp_engine=NlpEngineMock())
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=["CREDIT_CARD", "PHONE_NUMBER"],
        language="en",
        all_fields=False,
        score_threshold=0,
    )

    assert len(results) == 2
def test_added_pattern_recognizer_works(unit_test_guid):
    """A custom pattern recognizer is only applied after being registered."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])

    store_api_mock = RecognizerStoreApiMock()
    engine = AnalyzerEngine(
        registry=MockRecognizerRegistry(store_api_mock),
        nlp_engine=NlpEngineMock(),
    )
    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]

    def run_analysis():
        # One analyze call with the shared parameters.
        return engine.analyze(
            correlation_id=unit_test_guid,
            text=text,
            entities=entities,
            language="en",
            all_fields=False,
        )

    # The analyzer must not detect ROCKET before the recognizer exists.
    assert len(run_analysis()) == 0

    # Register the recognizer for the word "rocket" (case insensitive).
    store_api_mock.add_custom_pattern_recognizer(rocket_recognizer)

    # Now the entity should be recognized.
    results = run_analysis()
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
Пример #5
0
    def test_analyze_with_multiple_predefined_recognizers(self):
        """Credit-card and phone-number recognizers both produce results."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"

        # Unlike the global engine, this one loads SpaCy so it can use
        # the context words around each match.
        engine = AnalyzerEngine(
            registry=self.loaded_registry, nlp_engine=loaded_spacy_nlp_engine)
        results = engine.analyze(self.unit_test_guid,
                                 text,
                                 ["CREDIT_CARD", "PHONE_NUMBER"],
                                 "en",
                                 all_fields=False)

        assert len(results) == 2
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)
        # 0.5 (medium regex) + 0.35 (context boost) = 0.85
        phone_score = (UsPhoneRecognizer.MEDIUM_REGEX_SCORE +
                       PatternRecognizer.CONTEXT_SIMILARITY_FACTOR)
        assert_result(results[1], "PHONE_NUMBER", 48, 59, phone_score)
Пример #6
0
    def test_get_recognizers_returns_custom(self):
        """GetAllRecognizers lists a custom recognizer next to predefined ones."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket_recognizer = PatternRecognizer("ROCKET",
                                              name="Rocket recognizer",
                                              patterns=[rocket_pattern])

        store_api_mock = RecognizerStoreApiMock()
        store_api_mock.add_custom_pattern_recognizer(rocket_recognizer)
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_api_mock),
            nlp_engine=MockNlpEngine())
        response = engine.GetAllRecognizers(
            RecognizersAllRequest(language="en"), None)
        # there are 15 predefined recognizers and one custom
        assert len(response) == 16
        matches = [r for r in response
                   if r.name == "Rocket recognizer"
                   and r.entities == ["ROCKET"]
                   and r.language == "en"]
        assert len(matches) == 1
Пример #7
0
    def test_removed_pattern_recognizer_doesnt_work(self):
        """A custom recognizer stops producing results once it is removed."""
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity before registration
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(registry=MockRecognizerRegistry(
            recognizers_store_api_mock), nlp_engine=MockNlpEngine())
        text = "spaceship is my favorite transportation"
        entities = ["CREDIT_CARD", "SPACESHIP"]

        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities,
                                         language='en', all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "spaceship" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities,
                                         language='en', all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer(
            "Spaceship recognizer")
        # Test again to see we didn't get any results
        results = analyze_engine.analyze(self.unit_test_guid, text=text,
                                         entities=entities,
                                         language='en', all_fields=False)

        assert len(results) == 0
Пример #8
0
def test_when_analyze_added_pattern_recognizer_then_succeed(unit_test_guid):
    """The ROCKET entity is detected only after its recognizer is registered."""
    rocket_pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])

    registry_mock = RecognizerRegistryMock()

    # The engine starts out without the custom entity.
    engine = AnalyzerEngine(
        registry=registry_mock,
        nlp_engine=NlpEngineMock(),
    )
    text = "rocket is my favorite transportation"
    entities = ["CREDIT_CARD", "ROCKET"]

    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
    )
    assert len(results) == 0

    # Register the recognizer for the word "rocket" (case insensitive).
    registry_mock.add_recognizer(rocket_recognizer)

    # The entity should now be recognized.
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=entities,
        language="en",
    )
    assert len(results) == 1
    assert_result(results[0], "ROCKET", 0, 7, 0.8)
Пример #9
0
def test_when_analyze_then_apptracer_has_value(loaded_registry, unit_test_guid,
                                               nlp_engine):
    """Analyzing with decision-process logging populates the app tracer."""
    text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"  # noqa E501
    tracer = AppTracerMock(enable_decision_process=True)
    engine = AnalyzerEngine(
        loaded_registry,
        app_tracer=tracer,
        log_decision_process=True,
        nlp_engine=nlp_engine,
    )
    results = engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=["CREDIT_CARD", "PHONE_NUMBER", "PERSON"],
        language="en",
        return_decision_process=True,
    )

    assert len(results) == 3
    assert all(r.analysis_explanation is not None for r in results)
    assert tracer.get_msg_counter() == 2
    assert tracer.get_last_trace() is not None
Пример #10
0
class PDM:
    """Thin wrapper around Presidio's analyzer/anonymizer that also
    reports how long each stage took."""

    def __init__(self, language='en'):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.language = language

    def predict(self, text, entities_of_interest=ENTITIES_OF_INTEREST):
        """Analyze then anonymize *text*; return timings, the anonymized
        text and the list of detected spans."""
        t0 = time()
        analyzer_results = self.analyzer.analyze(
            text, entities=entities_of_interest, language=self.language)
        t1 = time()
        anonymized = self.anonymizer.anonymize(
            text=text, analyzer_results=analyzer_results)
        t2 = time()

        detected = []
        for item in anonymized.items:
            detected.append({'start': item.start,
                             'end': item.end,
                             'entity_type': item.entity_type})
        return {'time_to_analyze': f'{t1-t0:.4f} seconds',
                'time_to_anonymize': f'{t2-t1:.4f} seconds',
                'anonymized_text': anonymized.text,
                'detected_items': detected}
Пример #11
0
 def test_remove_duplicates_different_entity_no_removal(self):
     # Both results cover the same span but have different entity types,
     # so __remove_duplicates must keep both of them.
     arr = [RecognizerResult(start=0, end=5, score=0.1, entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0, end=5, score=0.5, entity_type="y",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None))]
     results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)
     assert len(results) == 2
Пример #12
0
    def __init__(
        self,
        analyzer_engine=None,
        entities_to_keep: List[str] = None,
        verbose: bool = False,
        labeling_scheme="BIO",
        score_threshold=0.4,
    ):
        """
        Evaluation wrapper for the Presidio Analyzer
        :param analyzer_engine: object of type AnalyzerEngine (from
            presidio-analyzer); a fresh AnalyzerEngine is created when omitted
        :param entities_to_keep: entity names to restrict evaluation to
        :param verbose: forwarded to the base evaluator
        :param labeling_scheme: labeling scheme forwarded to the base
            evaluator (e.g. "BIO")
        :param score_threshold: minimal score for a detection to count
        """
        super().__init__(
            entities_to_keep=entities_to_keep,
            verbose=verbose,
            labeling_scheme=labeling_scheme,
        )
        # The default used to be `analyzer_engine=AnalyzerEngine()`, which
        # eagerly built one engine at class-definition time and shared it
        # across every instance; build it lazily per instance instead.
        if analyzer_engine is None:
            analyzer_engine = AnalyzerEngine()
        self.analyzer_engine = analyzer_engine

        self.score_threshold = score_threshold
Пример #13
0
def serve_command_handler(enable_trace_pii,
                          env_grpc_port=False,
                          grpc_port=3000):
    """Start the analyzer GRPC server and block until interrupted.

    :param enable_trace_pii: forwarded to AnalyzerEngine to allow PII tracing
    :param env_grpc_port: when truthy, read the port from the GRPC_PORT
        environment variable instead of using grpc_port
    :param grpc_port: fallback port for the GRPC listener
    """
    logger.info("Starting GRPC server")
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    logger.info("GRPC started")

    logger.info("Creating RecognizerRegistry")
    registry = RecognizerRegistry()
    logger.info("RecognizerRegistry created")
    logger.info("Creating SpacyNlpEngine")
    nlp_engine = SpacyNlpEngine()
    logger.info("SpacyNlpEngine created")

    analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server(
        AnalyzerEngine(registry=registry,
                       nlp_engine=nlp_engine,
                       enable_trace_pii=enable_trace_pii,
                       use_recognizer_store=True), server)

    logger.info("Added AnalyzeServiceServicer to server")

    if env_grpc_port:
        logger.info("Getting port {}".format(env_grpc_port))
        port = os.environ.get('GRPC_PORT')
        # BUG FIX: the original test `port is not None or port != ''` was
        # always true and crashed on int(None) when GRPC_PORT was unset;
        # only override the port when a non-empty value is present.
        if port:
            grpc_port = int(port)
    else:
        logger.info("env_grpc_port not provided. "
                    "Using grpc_port {}".format(grpc_port))

    server.add_insecure_port('[::]:' + str(grpc_port))
    logger.info("Starting GRPC listener at port {}".format(grpc_port))
    server.start()
    try:
        # grpc serves from background threads; keep the main thread alive
        # until Ctrl-C.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        server.stop(0)
else:
    emails_dict = {}

# Load the cached name dictionary if it was pickled earlier,
# otherwise start from an empty cache.
if os.path.isfile(names_dict_pkl_path):
    # NOTE(review): the file handle from open() is never closed explicitly;
    # tolerable in a one-shot script, but a `with` block would be safer.
    names_dict = pickle.load( open(names_dict_pkl_path, "rb") )
else:
    names_dict = {}

# Define locale and language dictionaries
# Maps (upper-case) Country column values to Faker locale codes.
faker_locales_dict = {'UNITED STATES': 'en_US', 'ITALY': 'it_IT', 'GERMANY': 'de_DE'}



# Initialize Presidio's analyzer and anonymizer
# https://microsoft.github.io/presidio/supported_entities/
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create a copy of the source dataset
df = dataset.copy()

# Apply the function anonymizeName for each value of the Name column
df.Name = pd.Series( [anonymizeName(text, country) for (text, country) in zip(df['Name'], df['Country'])] )

# Apply the function anonymizeEmail for each value of the Email column
df.Email = pd.Series( [anonymizeEmail(text, country) for (text, country) in zip(df['Email'], df['Country'])] )

# Column Notes is 'object' data type as it contains lot of NaN and
# Pandas doesn't recognize it as string. So it has to be cast to string
# in order to be anonymized. Then replace it with its anonymization
df.Notes = pd.Series( [anonymizeName(text, country) for (text, country) in zip(df['Notes'].astype('str'), df['Country'])] )
Пример #15
0
class TestAnalyzerEngine(TestCase):
    def __init__(self, *args, **kwargs):
        """Build the shared registry, engine and tracer used by the tests."""
        super(TestAnalyzerEngine, self).__init__(*args, **kwargs)
        self.loaded_registry = MockRecognizerRegistry(RecognizerStoreApiMock())
        empty_artifacts = NlpArtifacts([], [], [], [], None, "en")
        self.app_tracer = AppTracerMock(enable_interpretability=True)
        mock_nlp = MockNlpEngine(stopwords=[],
                                 punct_words=[],
                                 nlp_artifacts=empty_artifacts)
        self.loaded_analyzer_engine = AnalyzerEngine(
            self.loaded_registry,
            mock_nlp,
            app_tracer=self.app_tracer,
            enable_trace_pii=True)
        self.unit_test_guid = "00000000-0000-0000-0000-000000000000"

    def test_analyze_with_predefined_recognizers_return_results(self):
        """Requesting only CREDIT_CARD yields exactly one result."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      text,
                                                      ["CREDIT_CARD"],
                                                      "en",
                                                      all_fields=False)

        assert len(results) == 1
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)

    def test_analyze_with_multiple_predefined_recognizers(self):
        """Both requested recognizers (credit card + phone) produce results."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"

        # A dedicated engine that loads SpaCy so context words are available.
        engine = AnalyzerEngine(
            registry=self.loaded_registry, nlp_engine=loaded_spacy_nlp_engine)
        results = engine.analyze(self.unit_test_guid,
                                 text,
                                 ["CREDIT_CARD", "PHONE_NUMBER"],
                                 "en",
                                 all_fields=False)

        assert len(results) == 2
        assert_result(results[0], "CREDIT_CARD", 14, 33,
                      EntityRecognizer.MAX_SCORE)
        # medium regex score (0.5) boosted by context similarity (0.35)
        expected = (UsPhoneRecognizer.MEDIUM_REGEX_SCORE +
                    PatternRecognizer.CONTEXT_SIMILARITY_FACTOR)
        assert_result(results[1], "PHONE_NUMBER", 48, 59, expected)

    def test_analyze_without_entities(self):
        """Requesting an empty entity list must raise ValueError."""
        text = " Credit card: 4095-2609-9393-4932,  my name is  John Oliver, DateTime: September 18 Domain: microsoft.com"
        with pytest.raises(ValueError):
            self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                text,
                                                [],
                                                "en",
                                                all_fields=False)

    def test_analyze_with_empty_text(self):
        """Empty input text yields no results."""
        results = self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                      "",
                                                      ["CREDIT_CARD",
                                                       "PHONE_NUMBER"],
                                                      "en",
                                                      all_fields=False)

        assert len(results) == 0

    def test_analyze_with_unsupported_language(self):
        """An unsupported language code must raise ValueError."""
        with pytest.raises(ValueError):
            self.loaded_analyzer_engine.analyze(self.unit_test_guid,
                                                "",
                                                ["CREDIT_CARD",
                                                 "PHONE_NUMBER"],
                                                "de",
                                                all_fields=False)

    def test_remove_duplicates(self):
        """Identical spans of the same entity collapse to the higher score."""
        def make_result(score, entity_type):
            # Build a result over [0, 5) with a dummy explanation.
            return RecognizerResult(
                start=0,
                end=5,
                score=score,
                entity_type=entity_type,
                analysis_explanation=AnalysisExplanation(
                    recognizer='test',
                    original_score=0,
                    pattern_name='test',
                    pattern='test',
                    validation_result=None))

        arr = [make_result(0.1, "x"), make_result(0.5, "x")]
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)
        assert len(results) == 1
        assert results[0].score == 0.5
        # TODO: add more cases with bug:
        # bug# 597: Analyzer remove duplicates doesn't handle all cases of one result as a substring of the other

    def test_remove_duplicates_different_entity_no_removal(self):
        # Both results cover the same span but have different entity types,
        # so __remove_duplicates must keep both of them.
        arr = [
            RecognizerResult(start=0,
                             end=5,
                             score=0.1,
                             entity_type="x",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None)),
            RecognizerResult(start=0,
                             end=5,
                             score=0.5,
                             entity_type="y",
                             analysis_explanation=AnalysisExplanation(
                                 recognizer='test',
                                 original_score=0,
                                 pattern_name='test',
                                 pattern='test',
                                 validation_result=None))
        ]
        results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr)
        assert len(results) == 2

    def test_added_pattern_recognizer_works(self):
        """A custom ROCKET recognizer only fires after being registered."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket_recognizer = PatternRecognizer("ROCKET",
                                              name="Rocket recognizer",
                                              patterns=[rocket_pattern])

        # The engine starts without the custom entity.
        store_api_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]

        results = engine.analyze(self.unit_test_guid,
                                 text=text,
                                 entities=entities,
                                 language='en',
                                 all_fields=False)
        assert len(results) == 0

        # Register the recognizer for the word "rocket" (case insensitive).
        store_api_mock.add_custom_pattern_recognizer(rocket_recognizer)

        # The entity should now be recognized.
        results = engine.analyze(self.unit_test_guid,
                                 text=text,
                                 entities=entities,
                                 language='en',
                                 all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "ROCKET", 0, 7, 0.8)

    def test_removed_pattern_recognizer_doesnt_work(self):
        """A custom recognizer stops producing results once it is removed."""
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity before registration
        recognizers_store_api_mock = RecognizerStoreApiMock()
        analyze_engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(recognizers_store_api_mock),
            nlp_engine=MockNlpEngine())
        text = "spaceship is my favorite transportation"
        entities = ["CREDIT_CARD", "SPACESHIP"]

        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

        # Add a new recognizer for the word "spaceship" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)
        # Check that the entity is recognized:
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)
        assert len(results) == 1
        assert_result(results[0], "SPACESHIP", 0, 10, 0.8)

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")
        # Test again to see we didn't get any results
        results = analyze_engine.analyze(self.unit_test_guid,
                                         text=text,
                                         entities=entities,
                                         language='en',
                                         all_fields=False)

        assert len(results) == 0

    def test_apply_with_language_returns_correct_response(self):
        """Apply with an explicit language returns analyze results."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = 'en'
        request.analyzeTemplate.resultsScoreThreshold = 0
        field = request.analyzeTemplate.fields.add()
        field.name = 'CREDIT_CARD'
        field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"

        response = self.loaded_analyzer_engine.Apply(request, None)
        assert response.analyzeResults is not None

    def test_apply_with_no_language_returns_default(self):
        """Apply without a language falls back to the default and still works."""
        request = AnalyzeRequest()
        request.analyzeTemplate.language = ''
        request.analyzeTemplate.resultsScoreThreshold = 0
        field = request.analyzeTemplate.fields.add()
        field.name = 'CREDIT_CARD'
        field.minScore = '0.5'
        request.text = "My credit card number is 4916994465041084"

        response = self.loaded_analyzer_engine.Apply(request, None)
        assert response.analyzeResults is not None

    def test_when_allFields_is_true_return_all_fields(self):
        """allFields=True returns entities that were never listed explicitly."""
        engine = AnalyzerEngine(registry=MockRecognizerRegistry(),
                                nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.analyzeTemplate.resultsScoreThreshold = 0
        request.text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090 " \
                       "Domain: microsoft.com"
        response = engine.Apply(request, None)
        assert response.analyzeResults is not None

        found = {result.field.name for result in response.analyzeResults}
        assert "CREDIT_CARD" in found
        assert "PHONE_NUMBER" in found
        assert "DOMAIN_NAME" in found

    def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
            self):
        """allFields with the real registry surfaces NLP-based entities too."""
        engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                nlp_engine=loaded_spacy_nlp_engine)
        request = AnalyzeRequest()
        request.analyzeTemplate.allFields = True
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        response = engine.Apply(request, None)
        assert response.analyzeResults is not None

        found = {result.field.name for result in response.analyzeResults}
        assert "PERSON" in found
        assert "LOCATION" in found
        assert "DOMAIN_NAME" in found

    def test_when_allFields_is_true_and_entities_not_empty_exception(self):
        """Combining allFields=True with explicit fields must raise."""
        engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                nlp_engine=MockNlpEngine())
        request = AnalyzeRequest()
        request.text = "My name is David and I live in Seattle." \
                       "Domain: microsoft.com "
        request.analyzeTemplate.allFields = True
        field = request.analyzeTemplate.fields.add()
        field.name = 'CREDIT_CARD'
        field.minScore = '0.5'
        with pytest.raises(ValueError):
            engine.Apply(request, None)

    def test_when_analyze_then_apptracer_has_value(self):
        """Tracing is recorded when analyze runs with trace=True."""
        text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        engine = AnalyzerEngine(
            self.loaded_registry,
            app_tracer=self.app_tracer,
            enable_trace_pii=True,
            nlp_engine=TESTS_NLP_ENGINE)
        results = engine.analyze(
            correlation_id=self.unit_test_guid,
            text=text,
            entities=["CREDIT_CARD", "PHONE_NUMBER", "PERSON"],
            language="en",
            all_fields=False,
            trace=True)

        assert len(results) == 3
        assert all(r.analysis_explanation is not None for r in results)
        assert self.app_tracer.get_msg_counter() == 2
        assert self.app_tracer.get_last_trace() is not None

    def test_when_threshold_is_zero_all_results_pass(self):
        """A zero score threshold lets every detection through."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"

        # Dedicated engine (mock NLP) so the phone number entity is detected.
        engine = AnalyzerEngine(registry=self.loaded_registry,
                                nlp_engine=MockNlpEngine())
        results = engine.analyze(self.unit_test_guid,
                                 text,
                                 ["CREDIT_CARD", "PHONE_NUMBER"],
                                 "en",
                                 all_fields=False,
                                 score_threshold=0)

        assert len(results) == 2

    def test_when_threshold_is_more_than_half_only_credit_card_passes(self):
        """With threshold 0.51 only the high-confidence credit card remains."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"

        # Dedicated engine (mock NLP) so the phone number entity is detected.
        engine = AnalyzerEngine(registry=self.loaded_registry,
                                nlp_engine=MockNlpEngine())
        results = engine.analyze(self.unit_test_guid,
                                 text,
                                 ["CREDIT_CARD", "PHONE_NUMBER"],
                                 "en",
                                 all_fields=False,
                                 score_threshold=0.51)

        assert len(results) == 1

    def test_when_default_threshold_is_more_than_half_only_one_passes(self):
        """An engine-level default_score_threshold=0.7 filters the phone hit."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # Separate engine from the module-global one so the phone-number
        # recognizer gets its own NLP engine for this case.
        engine = AnalyzerEngine(registry=self.loaded_registry,
                                nlp_engine=MockNlpEngine(),
                                default_score_threshold=0.7)
        results = engine.analyze(self.unit_test_guid,
                                 text,
                                 entities,
                                 language,
                                 all_fields=False)

        assert len(results) == 1

    def test_when_default_threshold_is_zero_all_results_pass(self):
        """Without an explicit threshold the default (0) lets both hits pass."""
        text = " Credit card: 4095-2609-9393-4932,  my phone is 425 8829090"
        language = "en"
        entities = ["CREDIT_CARD", "PHONE_NUMBER"]

        # Separate engine from the module-global one so the phone-number
        # recognizer gets its own NLP engine for this case.
        engine = AnalyzerEngine(registry=self.loaded_registry,
                                nlp_engine=MockNlpEngine())
        results = engine.analyze(self.unit_test_guid,
                                 text,
                                 entities,
                                 language,
                                 all_fields=False)

        assert len(results) == 2

    def test_demo_text(self):
        """End-to-end analysis of a mixed demo text covering many entity types."""
        text = ("Here are a few examples sentences we currently support:\n\n"
                "Hello, my name is David Johnson and I live in Maine.\n"
                "My credit card number is 4095-2609-9393-4932 and my "
                "Crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n\n"
                "On September 18 I visited microsoft.com and sent an "
                "email to [email protected],  from the IP 192.168.0.1.\n\n"
                "My passport: 991280345 and my phone number: (212) 555-1234.\n\n"
                "Please transfer using this IBAN IL150120690000003111111.\n\n"
                "Can you please check the status on bank account 954567876544 "
                "in PresidiBank?\n\n"
                "Kate's social security number is 078-05-1120.  "
                "Her driver license? it is 9234567B.\n\n"
                "This project welcomes contributions and suggestions.\n"
                "Most contributions require you to agree to a "
                "Contributor License Agreement (CLA) declaring "
                "that you have the right to, and actually do, "
                "grant us the rights to use your contribution. "
                "For details, visit https://cla.microsoft.com "
                "When you submit a pull request, "
                "a CLA-bot will automatically determine whether "
                "you need to provide a CLA and decorate the PR "
                "appropriately (e.g., label, comment).\n"
                "Simply follow the instructions provided by the bot. "
                "You will only need to do this once across all repos using our CLA.\n"
                "This project has adopted the Microsoft Open Source Code of Conduct.\n"
                "For more information see the Code of Conduct FAQ or "
                "contact [email protected] with any additional questions or comments.")

        language = "en"

        engine = AnalyzerEngine(default_score_threshold=0.35,
                                nlp_engine=loaded_spacy_nlp_engine)
        results = engine.analyze(correlation_id=self.unit_test_guid,
                                 text=text,
                                 entities=None,
                                 language=language,
                                 all_fields=True)
        for result in results:
            logger.info(
                f"Entity = {result.entity_type}, "
                f"Text = {text[result.start:result.end]}, "
                f"Score={result.score}, Start={result.start}, End={result.end}")
        detected_entities = [result.entity_type for result in results]

        # Expected number of detections per entity type.
        expected_counts = {
            "CREDIT_CARD": 1,
            "CRYPTO": 1,
            "DATE_TIME": 1,
            "DOMAIN_NAME": 4,
            "EMAIL_ADDRESS": 2,
            "IBAN_CODE": 1,
            "IP_ADDRESS": 1,
            "LOCATION": 1,
            "PERSON": 2,
            "PHONE_NUMBER": 1,
            "US_BANK_NUMBER": 1,
            "US_DRIVER_LICENSE": 1,
            "US_PASSPORT": 1,
            "US_SSN": 1,
        }
        for entity_type, expected in expected_counts.items():
            assert detected_entities.count(entity_type) == expected

        assert len(results) == 19

    def test_get_recognizers_returns_predefined(self):
        """A default registry exposes the full predefined recognizer set."""
        engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                nlp_engine=loaded_spacy_nlp_engine)
        response = engine.GetAllRecognizers(
            RecognizersAllRequest(language="en"), None)
        # 15 predefined recognizers cover the 17 supported entities.
        assert len(response) == 15

    def test_get_recognizers_returns_custom(self):
        """A custom recognizer in the store shows up in GetAllRecognizers."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket = PatternRecognizer("ROCKET",
                                   name="Rocket recognizer",
                                   patterns=[rocket_pattern])

        store_mock = RecognizerStoreApiMock()
        store_mock.add_custom_pattern_recognizer(rocket)
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())
        response = engine.GetAllRecognizers(
            RecognizersAllRequest(language="en"), None)
        # 15 predefined recognizers plus the custom one.
        assert len(response) == 16
        matches = [
            rec for rec in response
            if rec.name == "Rocket recognizer"
            and rec.entities == ["ROCKET"] and rec.language == "en"
        ]
        assert len(matches) == 1

    def test_get_recognizers_returns_added_custom(self):
        """Recognizers added after engine creation appear on re-query."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket = PatternRecognizer("ROCKET",
                                   name="Rocket recognizer",
                                   patterns=[rocket_pattern])

        store_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())
        request = RecognizersAllRequest(language="en")

        # Before adding: only the 15 predefined recognizers.
        assert len(engine.GetAllRecognizers(request, None)) == 15

        store_mock.add_custom_pattern_recognizer(rocket)
        # After adding: the custom recognizer is returned as well.
        assert len(engine.GetAllRecognizers(request, None)) == 16

    def test_get_recognizers_returns_supported_language(self):
        """Only recognizers matching the requested language are returned."""
        rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        rocket_ru = PatternRecognizer("ROCKET",
                                      name="Rocket recognizer RU",
                                      patterns=[rocket_pattern],
                                      supported_language="ru")

        store_mock = RecognizerStoreApiMock()
        store_mock.add_custom_pattern_recognizer(rocket_ru)
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())
        response = engine.GetAllRecognizers(
            RecognizersAllRequest(language="ru"), None)
        # The mocked Russian recognizer is the only "ru" entry.
        assert len(response) == 1
Пример #16
0
class PresidioPIIAnalyzer(BaseAnalyzer):
    """Detect (and optionally anonymize) PII using Microsoft Presidio.

    Wraps a Presidio ``AnalyzerEngine``/``AnonymizerEngine`` pair behind the
    ``BaseAnalyzer`` interface. spaCy models referenced by the engine
    configuration are downloaded on demand if missing.
    """

    _analyzer: AnalyzerEngine = PrivateAttr()
    _anonymizer: AnonymizerEngine = PrivateAttr()
    TYPE: str = "PresidioPII"
    # Engine/model configuration; falls back to PresidioEngineConfig().
    engine_config: Optional[PresidioEngineConfig] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # To add customer recognizers refer https://microsoft.github.io/presidio/analyzer/adding_recognizers/
    entity_recognizers: Optional[List[EntityRecognizer]] = None
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, OperatorConfig]] = None

    def __init__(self, **data: Any):
        """Build the NLP, analyzer and anonymizer engines from ``engine_config``."""
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        # Collect supported languages and, for the spaCy engine, make sure
        # each configured model is importable (downloading it otherwise).
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if self.engine_config.nlp_engine_name == "spacy":
                try:
                    spacy_model = __import__(model_config.model_name)
                    spacy_model.load()
                    logger.info(
                        f"Spacy model {model_config.model_name} is already downloaded"
                    )
                # BUGFIX: was a bare ``except:``, which also swallows
                # SystemExit/KeyboardInterrupt. ``except Exception`` keeps the
                # best-effort "download on miss" fallback without masking
                # interpreter-exit signals.
                except Exception:
                    logger.warning(
                        f"Spacy model {model_config.model_name} is not downloaded"
                    )
                    logger.warning(
                        f"Downloading spacy model {model_config.model_name}, it might take some time"
                    )
                    from spacy.cli import download

                    download(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(
            nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                        supported_languages=languages)

        # Register any user-supplied custom recognizers.
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with logger
        self._anonymizer = AnonymizerEngine()

    def analyze_input(
        self,
        source_response_list: List[AnalyzerRequest],
        analyzer_config: PresidioPIIAnalyzerConfig,
        language: Optional[str] = "en",
        **kwargs,
    ) -> List[AnalyzerResponse]:
        """Analyze (and optionally anonymize) each incoming text.

        :param source_response_list: texts to scan for PII.
        :param analyzer_config: entity filter, anonymizer and output options.
        :param language: language code passed to Presidio (default ``"en"``).
        :return: one ``AnalyzerResponse`` per input, carrying the analyzer
            results and, unless ``analyze_only`` is set, the anonymized output.
        """
        analyzer_output: List[AnalyzerResponse] = []

        for source_response in source_response_list:
            analyzer_result = self._analyzer.analyze(
                text=source_response.processed_text,
                entities=analyzer_config.entities,
                return_decision_process=analyzer_config.
                return_decision_process,
                language=language,
            )

            anonymized_result = None
            if not analyzer_config.analyze_only:
                # Per-request anonymizer config wins over the instance default.
                anonymizers_config = (analyzer_config.anonymizers_config
                                      or self.anonymizers_config)

                # Only anonymize non-empty text.
                if (source_response.processed_text is not None
                        and len(source_response.processed_text) > 0):
                    anonymized_result = self._anonymizer.anonymize(
                        text=source_response.processed_text,
                        operators=anonymizers_config,
                        analyzer_results=analyzer_result,
                    )

            # Optionally substitute the anonymized text for the original.
            if analyzer_config.replace_original_text and anonymized_result is not None:
                text = anonymized_result.text
            else:
                text = source_response.processed_text

            analyzer_output.append(
                AnalyzerResponse(
                    processed_text=text,
                    meta=source_response.meta,
                    segmented_data={
                        "analyzer_result":
                        [vars(result) for result in analyzer_result],
                        "anonymized_result":
                        None if not anonymized_result else
                        [vars(item) for item in anonymized_result.items],
                        "anonymized_text":
                        None
                        if not anonymized_result else anonymized_result.text,
                    },
                    source_name=source_response.source_name,
                ))

        return analyzer_output
Пример #17
0
    def test_demo_text(self):
        """End-to-end analysis of a mixed demo text covering many entity types."""
        text = ("Here are a few examples sentences we currently support:\n\n"
                "Hello, my name is David Johnson and I live in Maine.\n"
                "My credit card number is 4095-2609-9393-4932 and my "
                "Crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n\n"
                "On September 18 I visited microsoft.com and sent an "
                "email to [email protected],  from the IP 192.168.0.1.\n\n"
                "My passport: 991280345 and my phone number: (212) 555-1234.\n\n"
                "Please transfer using this IBAN IL150120690000003111111.\n\n"
                "Can you please check the status on bank account 954567876544 "
                "in PresidiBank?\n\n"
                "Kate's social security number is 078-05-1120.  "
                "Her driver license? it is 9234567B.\n\n"
                "This project welcomes contributions and suggestions.\n"
                "Most contributions require you to agree to a "
                "Contributor License Agreement (CLA) declaring "
                "that you have the right to, and actually do, "
                "grant us the rights to use your contribution. "
                "For details, visit https://cla.microsoft.com "
                "When you submit a pull request, "
                "a CLA-bot will automatically determine whether "
                "you need to provide a CLA and decorate the PR "
                "appropriately (e.g., label, comment).\n"
                "Simply follow the instructions provided by the bot. "
                "You will only need to do this once across all repos using our CLA.\n"
                "This project has adopted the Microsoft Open Source Code of Conduct.\n"
                "For more information see the Code of Conduct FAQ or "
                "contact [email protected] with any additional questions or comments.")

        language = "en"

        engine = AnalyzerEngine(default_score_threshold=0.35,
                                nlp_engine=loaded_spacy_nlp_engine)
        results = engine.analyze(correlation_id=self.unit_test_guid,
                                 text=text,
                                 entities=None,
                                 language=language,
                                 all_fields=True)
        for result in results:
            logger.info(
                f"Entity = {result.entity_type}, "
                f"Text = {text[result.start:result.end]}, "
                f"Score={result.score}, Start={result.start}, End={result.end}")
        detected_entities = [result.entity_type for result in results]

        # Expected number of detections per entity type.
        expected_counts = {
            "CREDIT_CARD": 1,
            "CRYPTO": 1,
            "DATE_TIME": 1,
            "DOMAIN_NAME": 4,
            "EMAIL_ADDRESS": 2,
            "IBAN_CODE": 1,
            "IP_ADDRESS": 1,
            "LOCATION": 1,
            "PERSON": 2,
            "PHONE_NUMBER": 1,
            "US_BANK_NUMBER": 1,
            "US_DRIVER_LICENSE": 1,
            "US_PASSPORT": 1,
            "US_SSN": 1,
        }
        for entity_type, expected in expected_counts.items():
            assert detected_entities.count(entity_type) == expected

        assert len(results) == 19
Пример #18
0
def test_when_demo_text_then_return_results(unit_test_guid, nlp_engine):
    """Analyze the demo text fixture and compare the tag-substituted output
    against the expected anonymized fixture file."""
    # Flatten the demo file's rows into one space-joined line of text.
    dir_path = Path(__file__).resolve().parent
    with open(Path(dir_path, "data", "demo.txt"), encoding="utf-8") as f:
        text_into_rows = f.read().split("\n")

    text_into_rows = [txt.strip() for txt in text_into_rows]
    text = " ".join(text_into_rows)
    language = "en"

    analyzer_engine = AnalyzerEngine(default_score_threshold=0.35,
                                     nlp_engine=nlp_engine)
    results = analyzer_engine.analyze(
        correlation_id=unit_test_guid,
        text=text,
        entities=None,
        language=language,
    )

    def replace_with_entity_name(original_text: str,
                                 responses: List[RecognizerResult]):
        """
        Performs replacements for every entity with its entity type
        """
        # Running offset: how much the text length has shifted so far,
        # since "<ENTITY_TYPE>" rarely has the same length as the span
        # it replaces. Processing in ascending start order keeps the
        # delta arithmetic valid.
        delta = 0
        new_text = original_text
        responses = sorted(responses, key=lambda x: x.start)
        for i, resp in enumerate(responses):
            # check if this response is already contained in a previous one
            if len([prev
                    for prev in responses[:i] if resp.contained_in(prev)]) > 0:
                continue
            # Map the original-span coordinates into the mutated text.
            start = resp.start + delta
            end = resp.end + delta
            entity_text = original_text[resp.start:resp.end]
            entity_type = resp.entity_type

            # Replace the span with "<ENTITY_TYPE>"; the new span length is
            # len(entity_type) + 2 for the angle brackets.
            new_text = f"{new_text[:start]}<{entity_type}>{new_text[end:]}"
            delta += len(entity_type) + 2 - len(entity_text)

        return new_text

    actual_anonymized_text = replace_with_entity_name(text, results)

    for result in results:
        text_slice = slice(result.start, result.end)
        print("Entity = {}, Text = {}, Score={}, Start={}, End={}".format(
            result.entity_type,
            text[text_slice],
            result.score,
            result.start,
            result.end,
        ))

    # Expected output fixture, flattened the same way as the input.
    with open(Path(dir_path, "data", "demo_anonymized.txt"),
              encoding="utf-8") as f_exp:
        text_into_rows = f_exp.read().split("\n")

    text_into_rows = [txt.strip() for txt in text_into_rows]
    expected_anonymized_text = " ".join(text_into_rows)

    #    assert len(results) == 19
    assert expected_anonymized_text == actual_anonymized_text
Пример #19
0
# Minimal Presidio end-to-end example: detect PII, then anonymize it.
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities.engine import OperatorConfig

text_to_anonymize = "His name is Tom and his phone number is 212-555-5555"

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Run all default recognizers over the text.
analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')
print("\nPII Detection:")
print(analyzer_results)

# Replace every detected entity with the literal "<ANONYMIZED>".
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators={
        "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})
    })
print("\nPII Anonymization:")
print(anonymized_results.to_json())
Пример #20
0
# Example: two ways of adding custom recognition to Presidio --
# a deny list and a regex pattern.
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

analyzer = AnalyzerEngine()

text1 = "Professor Plum, in the Dining Room, with the candlestick"

titles_list = [
    "Sir", "Ma'am", "Madam", "Mr.", "Mrs.", "Ms.", "Miss", "Dr.", "Professor"
]
# Deny-list recognizer: any exact title occurrence is tagged as TITLE.
titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                      deny_list=titles_list)
analyzer.registry.add_recognizer(titles_recognizer)

result = analyzer.analyze(text=text1, language='en')
print(f"\nDeny List result:\n {result}")

text2 = "I live in 510 Broad st."

# BUGFIX: use a raw string for the regex so "\d" is not parsed as an
# invalid string escape (SyntaxWarning on Python 3.12+).
numbers_pattern = Pattern(name="numbers_pattern", regex=r"\d+", score=0.5)
number_recognizer = PatternRecognizer(supported_entity="NUMBER",
                                      patterns=[numbers_pattern])

# A recognizer can also be used standalone, without an AnalyzerEngine.
result = number_recognizer.analyze(text=text2, entities=["NUMBER"])
print(f"\nRegex result:\n {result}")
def anonymize_reverse_lambda(analyzer_results, text_to_anonymize):
    """Anonymize EMAIL_ADDRESS entities by reversing the matched text."""
    reverse_op = OperatorConfig("custom", {"lambda": lambda x: x[::-1]})
    return anonymizer.anonymize(
        text=text_to_anonymize,
        analyzer_results=analyzer_results,
        operators={"EMAIL_ADDRESS": reverse_op},
    )

def anonymize_faker_lambda(analyzer_results, text_to_anonymize):
    """Anonymize EMAIL_ADDRESS entities with a Faker-generated safe email."""
    fake_email_op = OperatorConfig(
        "custom", {"lambda": lambda x: fake.safe_email()})
    return anonymizer.anonymize(
        text=text_to_anonymize,
        analyzer_results=analyzer_results,
        operators={"EMAIL_ADDRESS": fake_email_op},
    )

# Demo driver for the two custom-lambda anonymizers above.
# NOTE(review): relies on `Faker` and `internet` being imported earlier
# (``from faker import Faker; from faker.providers import internet``);
# not visible in this excerpt -- confirm.
fake = Faker('en_US')
fake.add_provider(internet)

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

text = 'The user has the following two emails: [email protected] and [email protected]'
analyzer_results = analyzer.analyze(text=text, entities=["EMAIL_ADDRESS"], language='en')
# BUGFIX: corrected typo in the printed label ("Origina" -> "Original").
print("Original Text: ", text)
print("Analyzer result:", analyzer_results, '\n')

print("Reverse lambda result: ", anonymize_reverse_lambda(analyzer_results, text).text, '\n')
print("Faker lambda result: ", anonymize_faker_lambda(analyzer_results, text).text, '\n')
Пример #22
0
# NOTE(review): `doc` and `document` are defined earlier in the original
# script (presumably python-docx's Document(doc)); not visible in this
# excerpt -- confirm against the full source.
print("\nDocument properties for", doc)
print("Author:", document.core_properties.author)
print("Last Modified By:", document.core_properties.last_modified_by)
print("Date:", document.core_properties.modified)
print("\n===============================================")

# Concatenate all paragraph text into a single string for analysis.
paras = document.paragraphs

doctext = ""
for i in paras:
    doctext += i.text

print("\n===============================================")
print("Extracted text from", doc)
print(doctext)
print("\n===============================================")

engine = AnalyzerEngine()

# entities=[] with all_fields=True asks for every supported entity type;
# only results scoring at least 0.5 are returned.
response = engine.analyze(correlation_id=0,
                          text = doctext,
                          entities=[],
                          language='en',
                          all_fields=True,
                          score_threshold=0.5)

for item in response:
    print("Start = {}, end = {}, entity = {}, confidence = {}".format(item.start,
                                                                      item.end,
                                                                      item.entity_type,
                                                                      item.score))
Пример #23
0
 def __init__(self, language='en'):
     """Create default Presidio analyzer/anonymizer engines.

     :param language: language code used for subsequent analyze calls.
     """
     self.analyzer = AnalyzerEngine()
     self.anonymizer = AnonymizerEngine()
     self.language = language
Пример #24
0
 def __init__(self):
     """Create default Presidio analyzer/anonymizer engines."""
     self.analyzer = AnalyzerEngine()
     self.anonymizer = AnonymizerEngine()
Пример #25
0
def test_when_read_test_spacy_nlp_conf_file_then_returns_spacy_nlp_engine(
    mock_registry, ):
    """Default configuration should yield a loaded SpacyNlpEngine."""
    engine = AnalyzerEngine(registry=mock_registry)

    nlp_engine = engine.nlp_engine
    assert isinstance(nlp_engine, SpacyNlpEngine)
    assert nlp_engine.nlp is not None
Пример #26
0
def analyze(text):
    """Run the default Presidio analyzer over *text* in English."""
    return AnalyzerEngine().analyze(text=text, language='en')
Пример #27
0
        Load the Presidio-supported TextAnalyticsEntityCategory
        from a yaml configuration file.
        """
        categories_file = yaml.safe_load(open(file_location))
        return [
            TextAnalyticsEntityCategory(**category)
            for category in categories_file
        ]


if __name__ == "__main__":
    import os
    from presidio_analyzer import AnalyzerEngine

    # Instruction for setting up Text Analytics and fetch instance key and endpoint:
    # https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/cognitive-services/text-analytics/includes/create-text-analytics-resource.md
    categories_path = os.path.join(
        os.path.dirname(__file__),
        "example_text_analytics_entity_categories.yaml")
    text_analytics_recognizer = TextAnalyticsRecognizer(
        text_analytics_key="<YOUR_TEXT_ANALYTICS_KEY>",
        text_analytics_endpoint="<YOUR_TEXT_ANALYTICS_ENDPOINT>",
        categories_file_location=categories_path,
    )

    # Register the Azure-backed recognizer alongside the defaults.
    analyzer = AnalyzerEngine()
    analyzer.registry.add_recognizer(text_analytics_recognizer)
    results = analyzer.analyze(
        text="David is 30 years old. His IBAN: IL150120690000003111111",
        language="en")
    print(results)