def test_remove_pattern_recognizer(self):
        pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("SPACESHIP",
                                               name="Spaceship recognizer",
                                               patterns=[pattern])
        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

        # Expects zero custom recognizers
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        # Expects one custom recognizer
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 1

        # Remove recognizer
        recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")

        # Expects zero custom recognizers
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 0
Exemplo n.º 2
0
    def __init__(self,
                 registry=None,
                 nlp_engine=None,
                 app_tracer=None,
                 enable_trace_pii=False,
                 default_score_threshold=None,
                 use_recognizer_store=False,
                 default_language="en"):
        """
        AnalyzerEngine class: Orchestrating the detection of PII entities
        and all related logic
        :param registry: instance of type RecognizerRegistry
        :param nlp_engine: instance of type NlpEngine
        (for example SpacyNlpEngine)
        :param app_tracer: instance of type AppTracer,
        used to trace the logic used during each request
        :param enable_trace_pii: bool,
        defines whether PII values should be traced or not.
        :param default_score_threshold: Minimum confidence value
        for detected entities to be returned
        :param use_recognizer_store Whether to call the
        Presidio Recognizer Store on every request to gather
        responses from custom recognizers as well
        (only applicable for the full Presidio service)
        """
        if not nlp_engine:
            logger.info("nlp_engine not provided. Creating new "
                        "SpacyNlpEngine instance")
            nlp_engine = NLP_ENGINES["spacy"]()
        if not registry:
            logger.info("Recognizer registry not provided. "
                        "Creating default RecognizerRegistry instance")
            if use_recognizer_store:
                recognizer_store_api = RecognizerStoreApi()
            else:
                recognizer_store_api = None
            registry = RecognizerRegistry(
                recognizer_store_api=recognizer_store_api)
        if not app_tracer:
            app_tracer = AppTracer()

        # load nlp module
        self.nlp_engine = nlp_engine
        # prepare registry
        self.registry = registry
        # load all recognizers
        if not registry.recognizers:
            registry.load_predefined_recognizers()

        self.app_tracer = app_tracer
        self.enable_trace_pii = enable_trace_pii

        self.default_score_threshold = default_score_threshold \
            if default_score_threshold \
            else 0.0

        self.default_language = default_language
def test_when_multiple_entities_from_same_recognizer_only_one_is_returned():
    registry = RecognizerRegistry()

    recognizer_supporting_two_ents = EntityRecognizer(
        supported_entities=["A", "B"], name="MyReco"
    )
    registry.add_recognizer(recognizer_supporting_two_ents)
    recognizers = registry.get_recognizers(
        language="en", entities=["A", "B"], all_fields=False
    )

    assert len(recognizers) == 1
    assert recognizers[0].name == "MyReco"
def test_when_add_pattern_recognizer_then_item_added():
    pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    pattern_recognizer = PatternRecognizer(
        "ROCKET", name="Rocket recognizer", patterns=[pattern]
    )

    # Create an empty recognizer registry
    recognizer_registry = RecognizerRegistry(recognizers=[])
    assert len(recognizer_registry.recognizers) == 0

    # Add a new recognizer for the word "rocket" (case insensitive)
    recognizer_registry.add_recognizer(pattern_recognizer)

    assert len(recognizer_registry.recognizers) == 1
    assert recognizer_registry.recognizers[0].patterns[0].name == "rocket pattern"
    assert recognizer_registry.recognizers[0].name == "Rocket recognizer"
Exemplo n.º 5
0
 def test_get_recognizers_returns_predefined(self):
     analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                     nlp_engine=loaded_spacy_nlp_engine)
     request = RecognizersAllRequest(language="en")
     response = analyze_engine.GetAllRecognizers(request, None)
     # there are 15 predefined recognizers that detect the 17 entities
     assert len(response) == 15
Exemplo n.º 6
0
    def __init__(self,
                 registry=None,
                 nlp_engine=None,
                 app_tracer=None,
                 enable_trace_pii=False,
                 default_score_threshold=None):
        """
        AnalyzerEngine class: Orchestrating the detection of PII entities
        and all related logic
        :param registry: instance of type RecognizerRegistry
        :param nlp_engine: instance of type NlpEngine
        (for example SpacyNlpEngine)
        :param app_tracer: instance of type AppTracer,
        used to trace the logic used during each request
        :param enable_trace_pii: bool,
        defines whether PII values should be traced or not.
        :param default_score_threshold: Minimum confidence value
        for detected entities to be returned
        """
        if not nlp_engine:
            logger.info("nlp_engine not provided. Creating new "
                        "SpacyNlpEngine instance")
            from presidio_analyzer.nlp_engine import SpacyNlpEngine
            nlp_engine = SpacyNlpEngine()
        if not registry:
            logger.info("Recognizer registry not provided. "
                        "Creating default RecognizerRegistry instance")
            from presidio_analyzer import RecognizerRegistry
            registry = RecognizerRegistry()
        if not app_tracer:
            app_tracer = AppTracer()

        # load nlp module
        self.nlp_engine = nlp_engine
        # prepare registry
        self.registry = registry
        # load all recognizers
        if not registry.recognizers:
            registry.load_predefined_recognizers()

        self.app_tracer = app_tracer
        self.enable_trace_pii = enable_trace_pii

        if default_score_threshold is None:
            self.default_score_threshold = 0
        else:
            self.default_score_threshold = default_score_threshold
    def test_add_pattern_recognizer(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Make sure the analyzer doesn't get this entity
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 0

        # Add a new recognizer for the word "rocket" (case insensitive)
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer)

        recognizers = recognizer_registry.get_custom_recognizers()
        assert len(recognizers) == 1
        assert recognizers[0].patterns[0].name == "rocket pattern"
        assert recognizers[0].name == "Rocket recognizer"
def test_when_allFields_is_true_and_entities_not_empty_exception():
    analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                    nlp_engine=NlpEngineMock())
    request = AnalyzeRequest()
    request.text = "My name is David and I live in Seattle." "Domain: microsoft.com "
    request.analyzeTemplate.allFields = True
    new_field = request.analyzeTemplate.fields.add()
    new_field.name = "CREDIT_CARD"
    new_field.minScore = "0.5"
    with pytest.raises(ValueError):
        analyze_engine.Apply(request, None)
Exemplo n.º 9
0
def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
        nlp_engine):
    analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                    nlp_engine=nlp_engine)
    request = AnalyzeRequest()
    request.analyzeTemplate.allFields = True
    request.text = "My name is David and I live in Seattle." "Domain: microsoft.com "
    response = analyze_engine.Apply(request, None)
    returned_entities = [field.field.name for field in response.analyzeResults]
    assert response.analyzeResults is not None
    assert "DOMAIN_NAME" in returned_entities
Exemplo n.º 10
0
def mock_recognizer_registry():
    pattern_recognizer1 = create_mock_pattern_recognizer("en", "PERSON", "1")
    pattern_recognizer2 = create_mock_pattern_recognizer("de", "PERSON", "2")
    pattern_recognizer3 = create_mock_pattern_recognizer("de", "ADDRESS", "3")
    pattern_recognizer4 = create_mock_pattern_recognizer("he", "ADDRESS", "4")
    pattern_recognizer5 = create_mock_custom_recognizer(
        "he", ["PERSON", "ADDRESS"], "5")
    return RecognizerRegistry([
        pattern_recognizer1,
        pattern_recognizer2,
        pattern_recognizer3,
        pattern_recognizer4,
        pattern_recognizer5,
    ], )
Exemplo n.º 11
0
    def __init__(
        self,
        registry: RecognizerRegistry = None,
        nlp_engine: NlpEngine = None,
        app_tracer: AppTracer = None,
        log_decision_process: bool = False,
        default_score_threshold: float = 0,
        supported_languages: List[str] = None,
    ):
        if not supported_languages:
            supported_languages = ["en"]

        if not nlp_engine:
            logger.info("nlp_engine not provided, creating default.")
            provider = NlpEngineProvider()
            nlp_engine = provider.create_engine()

        if not registry:
            logger.info("registry not provided, creating default.")
            registry = RecognizerRegistry()
        if not app_tracer:
            app_tracer = AppTracer()
        self.app_tracer = app_tracer

        self.supported_languages = supported_languages

        self.nlp_engine = nlp_engine
        self.registry = registry

        # load all recognizers
        if not registry.recognizers:
            registry.load_predefined_recognizers(
                nlp_engine=self.nlp_engine, languages=self.supported_languages
            )

        self.log_decision_process = log_decision_process
        self.default_score_threshold = default_score_threshold
Exemplo n.º 12
0
def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields(
    nlp_engine, ):
    analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(),
                                    nlp_engine=nlp_engine)
    threshold = 0
    text = "My name is Sharon and I live in Seattle." "Domain: microsoft.com "
    response = analyze_engine.analyze(text=text,
                                      score_threshold=threshold,
                                      language="en")
    returned_entities = [response.entity_type for response in response]

    assert response is not None
    assert "PERSON" in returned_entities
    assert "LOCATION" in returned_entities
    assert "DOMAIN_NAME" in returned_entities
Exemplo n.º 13
0
 def get_mock_recognizer_registry(self):
     pattern_recognizer1 = self.get_mock_pattern_recognizer(
         "en", "PERSON", "1")
     pattern_recognizer2 = self.get_mock_pattern_recognizer(
         "de", "PERSON", "2")
     pattern_recognizer3 = self.get_mock_pattern_recognizer(
         "de", "ADDRESS", "3")
     pattern_recognizer4 = self.get_mock_pattern_recognizer(
         "he", "ADDRESS", "4")
     pattern_recognizer5 = self.get_mock_custom_recognizer(
         "he", ["PERSON", "ADDRESS"], "5")
     recognizers_store_api_mock = RecognizerStoreApiMock()
     return RecognizerRegistry(recognizers_store_api_mock, [
         pattern_recognizer1, pattern_recognizer2, pattern_recognizer3,
         pattern_recognizer4, pattern_recognizer5
     ])
Exemplo n.º 14
0
    def test_cache_logic(self):
        pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
        pattern_recognizer = PatternRecognizer("ROCKET",
                                               name="Rocket recognizer",
                                               patterns=[pattern])

        # Negative flow
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        # Nothing should be returned
        assert len(custom_recognizers) == 0
        # Since no hash was returned, then no access to storage is expected
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Add a new recognizer
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer, skip_hash_update=True)

        # Since the hash wasn't updated the recognizers are stale from the cache
        # without the newly added one
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 0
        # And we also didn't accessed the underlying storage
        assert recognizers_store_api_mock.times_accessed_storage == 0

        # Positive flow
        # Now do the same only this time update the hash so it should work properly
        recognizers_store_api_mock = RecognizerStoreApiMock()
        recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

        recognizer_registry.get_custom_recognizers()
        assert recognizers_store_api_mock.times_accessed_storage == 0
        recognizers_store_api_mock.add_custom_pattern_recognizer(
            pattern_recognizer, skip_hash_update=False)
        custom_recognizers = recognizer_registry.get_custom_recognizers()
        assert len(custom_recognizers) == 1
        # Accessed again
        assert recognizers_store_api_mock.times_accessed_storage == 1
Exemplo n.º 15
0
def serve_command_handler(enable_trace_pii,
                          env_grpc_port=False,
                          grpc_port=3000):
    logger.info("Starting GRPC server")
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    logger.info("GRPC started")

    logger.info("Creating RecognizerRegistry")
    registry = RecognizerRegistry()
    logger.info("RecognizerRegistry created")
    logger.info("Creating SpacyNlpEngine")
    nlp_engine = SpacyNlpEngine()
    logger.info("SpacyNlpEngine created")

    analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server(
        AnalyzerEngine(registry=registry,
                       nlp_engine=nlp_engine,
                       enable_trace_pii=enable_trace_pii,
                       use_recognizer_store=True), server)

    logger.info("Added AnalyzeServiceServicer to server")

    if env_grpc_port:
        logger.info("Getting port {}".format(env_grpc_port))
        port = os.environ.get('GRPC_PORT')
        if port is not None or port != '':
            grpc_port = int(port)
    else:
        logger.info("env_grpc_port not provided. "
                    "Using grpc_port {}".format(grpc_port))

    server.add_insecure_port('[::]:' + str(grpc_port))
    logger.info("Starting GRPC listener at port {}".format(grpc_port))
    server.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        server.stop(0)
Exemplo n.º 16
0
def test_when_add_recognizer_then_also_outputs_others(nlp_engine):
    pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)
    pattern_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[pattern],
        supported_language="en",
    )
    registry = RecognizerRegistry()
    registry.add_recognizer(pattern_recognizer)
    registry.load_predefined_recognizers()

    assert len(registry.recognizers) > 1

    analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)

    text = "Michael Jones has a rocket"

    results = analyzer.analyze(text=text, language="en")
    assert len(results) == 2
Exemplo n.º 17
0
def test_when_remove_pattern_recognizer_then_item_removed():
    pattern = Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)
    pattern_recognizer = PatternRecognizer(
        "SPACESHIP", name="Spaceship recognizer", patterns=[pattern]
    )
    # Create an empty recognizer registry
    recognizer_registry = RecognizerRegistry(recognizers=[])
    assert len(recognizer_registry.recognizers) == 0

    # Add a new recognizer for the word "rocket" (case insensitive)
    recognizer_registry.add_recognizer(pattern_recognizer)

    # Expects one custom recognizer
    assert len(recognizer_registry.recognizers) == 1

    # Remove recognizer
    recognizer_registry.remove_recognizer("Spaceship recognizer")

    # Expects zero custom recognizers
    assert len(recognizer_registry.recognizers) == 0
Exemplo n.º 18
0
def loaded_registry():
    return RecognizerRegistry()