def test_remove_pattern_recognizer(self):
    """A custom recognizer added via the store can be removed by name."""
    pattern = Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8)
    pattern_recognizer = PatternRecognizer("SPACESHIP",
                                           name="Spaceship recognizer",
                                           patterns=[pattern])
    # Make sure the analyzer doesn't get this entity
    recognizers_store_api_mock = RecognizerStoreApiMock()
    recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)

    # Expects zero custom recognizers
    recognizers = recognizer_registry.get_custom_recognizers()
    assert len(recognizers) == 0

    # Add a new recognizer for the word "spaceship" (case insensitive)
    recognizers_store_api_mock.add_custom_pattern_recognizer(
        pattern_recognizer)

    # Expects one custom recognizer
    recognizers = recognizer_registry.get_custom_recognizers()
    assert len(recognizers) == 1

    # Remove recognizer by its registered name
    recognizers_store_api_mock.remove_recognizer("Spaceship recognizer")

    # Expects zero custom recognizers
    recognizers = recognizer_registry.get_custom_recognizers()
    assert len(recognizers) == 0
def __init__(self, registry=None, nlp_engine=None,
             app_tracer=None, enable_trace_pii=False,
             default_score_threshold=None, use_recognizer_store=False,
             default_language="en"):
    """
    AnalyzerEngine class: Orchestrating the detection of PII entities
    and all related logic.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
        (for example SpacyNlpEngine)
    :param app_tracer: instance of type AppTracer, used to trace the
        logic used during each request
    :param enable_trace_pii: bool, defines whether PII values should be
        traced or not.
    :param default_score_threshold: Minimum confidence value for
        detected entities to be returned
    :param use_recognizer_store: Whether to call the Presidio Recognizer
        Store on every request to gather responses from custom
        recognizers as well (only applicable for the full Presidio
        service)
    :param default_language: language code used when a request does not
        specify one (defaults to "en")
    """
    if not nlp_engine:
        logger.info("nlp_engine not provided. Creating new "
                    "SpacyNlpEngine instance")
        nlp_engine = NLP_ENGINES["spacy"]()
    if not registry:
        logger.info("Recognizer registry not provided. "
                    "Creating default RecognizerRegistry instance")
        # Only attach a recognizer-store API when explicitly requested;
        # otherwise the registry works with local recognizers only.
        if use_recognizer_store:
            recognizer_store_api = RecognizerStoreApi()
        else:
            recognizer_store_api = None
        registry = RecognizerRegistry(
            recognizer_store_api=recognizer_store_api)
    if not app_tracer:
        app_tracer = AppTracer()
    # load nlp module
    self.nlp_engine = nlp_engine
    # prepare registry
    self.registry = registry
    # load all recognizers
    if not registry.recognizers:
        registry.load_predefined_recognizers()
    self.app_tracer = app_tracer
    self.enable_trace_pii = enable_trace_pii
    # Falls back to 0.0 when no threshold was given (an explicit 0.0
    # also takes this branch, which yields the same value).
    self.default_score_threshold = default_score_threshold \
        if default_score_threshold \
        else 0.0
    self.default_language = default_language
def test_when_multiple_entities_from_same_recognizer_only_one_is_returned():
    """A recognizer supporting several entities is returned only once."""
    registry = RecognizerRegistry()
    multi_entity_recognizer = EntityRecognizer(
        supported_entities=["A", "B"], name="MyReco"
    )
    registry.add_recognizer(multi_entity_recognizer)

    matched = registry.get_recognizers(
        language="en", entities=["A", "B"], all_fields=False
    )

    assert len(matched) == 1
    assert matched[0].name == "MyReco"
def test_when_add_pattern_recognizer_then_item_added():
    """Adding a pattern recognizer stores it in the registry."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
    )

    # Start from an empty recognizer registry
    registry = RecognizerRegistry(recognizers=[])
    assert len(registry.recognizers) == 0

    # Add a new recognizer for the word "rocket" (case insensitive)
    registry.add_recognizer(rocket_recognizer)

    assert len(registry.recognizers) == 1
    added = registry.recognizers[0]
    assert added.patterns[0].name == "rocket pattern"
    assert added.name == "Rocket recognizer"
def test_get_recognizers_returns_predefined(self):
    """GetAllRecognizers lists every predefined recognizer."""
    engine = AnalyzerEngine(registry=RecognizerRegistry(),
                            nlp_engine=loaded_spacy_nlp_engine)
    request = RecognizersAllRequest(language="en")
    result = engine.GetAllRecognizers(request, None)
    # there are 15 predefined recognizers that detect the 17 entities
    assert len(result) == 15
def __init__(self, registry=None, nlp_engine=None,
             app_tracer=None, enable_trace_pii=False,
             default_score_threshold=None):
    """
    AnalyzerEngine class: Orchestrating the detection of PII entities
    and all related logic.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
        (for example SpacyNlpEngine)
    :param app_tracer: instance of type AppTracer, used to trace the
        logic used during each request
    :param enable_trace_pii: bool, defines whether PII values should be
        traced or not.
    :param default_score_threshold: Minimum confidence value for
        detected entities to be returned
    """
    if not nlp_engine:
        logger.info("nlp_engine not provided. Creating new "
                    "SpacyNlpEngine instance")
        # Imported lazily to avoid loading spaCy unless it is needed.
        from presidio_analyzer.nlp_engine import SpacyNlpEngine
        nlp_engine = SpacyNlpEngine()
    if not registry:
        logger.info("Recognizer registry not provided. "
                    "Creating default RecognizerRegistry instance")
        from presidio_analyzer import RecognizerRegistry
        registry = RecognizerRegistry()
    if not app_tracer:
        app_tracer = AppTracer()
    # load nlp module
    self.nlp_engine = nlp_engine
    # prepare registry
    self.registry = registry
    # load all recognizers
    if not registry.recognizers:
        registry.load_predefined_recognizers()
    self.app_tracer = app_tracer
    self.enable_trace_pii = enable_trace_pii
    # Explicit None check so a caller-provided 0 is kept as-is.
    if default_score_threshold is None:
        self.default_score_threshold = 0
    else:
        self.default_score_threshold = default_score_threshold
def test_add_pattern_recognizer(self):
    """A recognizer added to the store becomes visible via the registry."""
    rocket_pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    rocket_recognizer = PatternRecognizer("ROCKET",
                                          name="Rocket recognizer",
                                          patterns=[rocket_pattern])
    # Make sure the analyzer doesn't get this entity
    store_mock = RecognizerStoreApiMock()
    registry = RecognizerRegistry(store_mock)
    assert len(registry.get_custom_recognizers()) == 0

    # Add a new recognizer for the word "rocket" (case insensitive)
    store_mock.add_custom_pattern_recognizer(rocket_recognizer)

    loaded = registry.get_custom_recognizers()
    assert len(loaded) == 1
    assert loaded[0].patterns[0].name == "rocket pattern"
    assert loaded[0].name == "Rocket recognizer"
def test_when_allFields_is_true_and_entities_not_empty_exception():
    """Specifying fields together with allFields=True must raise."""
    engine = AnalyzerEngine(registry=RecognizerRegistry(),
                            nlp_engine=NlpEngineMock())
    request = AnalyzeRequest()
    request.text = ("My name is David and I live in Seattle."
                    "Domain: microsoft.com ")
    request.analyzeTemplate.allFields = True

    # Adding an explicit field contradicts allFields=True
    field = request.analyzeTemplate.fields.add()
    field.name = "CREDIT_CARD"
    field.minScore = "0.5"

    with pytest.raises(ValueError):
        engine.Apply(request, None)
def test_when_allFields_is_true_full_recognizers_list_return_all_fields(
        nlp_engine):
    """With allFields set, entities from all recognizers may be returned."""
    engine = AnalyzerEngine(registry=RecognizerRegistry(),
                            nlp_engine=nlp_engine)
    request = AnalyzeRequest()
    request.analyzeTemplate.allFields = True
    request.text = ("My name is David and I live in Seattle."
                    "Domain: microsoft.com ")

    response = engine.Apply(request, None)

    assert response.analyzeResults is not None
    detected = [res.field.name for res in response.analyzeResults]
    assert "DOMAIN_NAME" in detected
def mock_recognizer_registry():
    """Build a registry holding mock recognizers for several languages."""
    recognizers = [
        create_mock_pattern_recognizer("en", "PERSON", "1"),
        create_mock_pattern_recognizer("de", "PERSON", "2"),
        create_mock_pattern_recognizer("de", "ADDRESS", "3"),
        create_mock_pattern_recognizer("he", "ADDRESS", "4"),
        create_mock_custom_recognizer("he", ["PERSON", "ADDRESS"], "5"),
    ]
    return RecognizerRegistry(recognizers)
def __init__(
    self,
    registry: RecognizerRegistry = None,
    nlp_engine: NlpEngine = None,
    app_tracer: AppTracer = None,
    log_decision_process: bool = False,
    default_score_threshold: float = 0,
    supported_languages: List[str] = None,
):
    """Orchestrate the detection of PII entities.

    :param registry: instance of type RecognizerRegistry
    :param nlp_engine: instance of type NlpEngine
    :param app_tracer: instance of type AppTracer, used to trace the
        decision process of each request
    :param log_decision_process: bool, whether the decision process
        should be logged
    :param default_score_threshold: minimum confidence value for
        detected entities to be returned
    :param supported_languages: list of language codes the engine
        should support (defaults to ["en"])
    """
    if not supported_languages:
        supported_languages = ["en"]

    if not nlp_engine:
        logger.info("nlp_engine not provided, creating default.")
        provider = NlpEngineProvider()
        nlp_engine = provider.create_engine()

    if not registry:
        logger.info("registry not provided, creating default.")
        registry = RecognizerRegistry()
    if not app_tracer:
        app_tracer = AppTracer()
    self.app_tracer = app_tracer

    self.supported_languages = supported_languages

    self.nlp_engine = nlp_engine
    self.registry = registry

    # load all recognizers
    if not registry.recognizers:
        registry.load_predefined_recognizers(
            nlp_engine=self.nlp_engine, languages=self.supported_languages
        )

    self.log_decision_process = log_decision_process
    self.default_score_threshold = default_score_threshold
def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields(
    nlp_engine,
):
    """With no entity filter, results from all loaded recognizers return."""
    engine = AnalyzerEngine(registry=RecognizerRegistry(),
                            nlp_engine=nlp_engine)
    text = ("My name is Sharon and I live in Seattle."
            "Domain: microsoft.com ")

    results = engine.analyze(text=text, score_threshold=0, language="en")

    assert results is not None
    detected = [result.entity_type for result in results]
    for expected_entity in ("PERSON", "LOCATION", "DOMAIN_NAME"):
        assert expected_entity in detected
def get_mock_recognizer_registry(self):
    """Build a registry backed by a mock store, covering several
    languages and entity types."""
    recognizers = [
        self.get_mock_pattern_recognizer("en", "PERSON", "1"),
        self.get_mock_pattern_recognizer("de", "PERSON", "2"),
        self.get_mock_pattern_recognizer("de", "ADDRESS", "3"),
        self.get_mock_pattern_recognizer("he", "ADDRESS", "4"),
        self.get_mock_custom_recognizer("he", ["PERSON", "ADDRESS"], "5"),
    ]
    return RecognizerRegistry(RecognizerStoreApiMock(), recognizers)
def test_cache_logic(self):
    """The registry serves recognizers from cache and only hits the
    underlying store when the store's hash changes."""
    pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)
    pattern_recognizer = PatternRecognizer("ROCKET",
                                           name="Rocket recognizer",
                                           patterns=[pattern])

    # Negative flow
    recognizers_store_api_mock = RecognizerStoreApiMock()
    recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
    custom_recognizers = recognizer_registry.get_custom_recognizers()
    # Nothing should be returned
    assert len(custom_recognizers) == 0
    # Since no hash was returned, then no access to storage is expected
    assert recognizers_store_api_mock.times_accessed_storage == 0

    # Add a new recognizer WITHOUT updating the store hash
    recognizers_store_api_mock.add_custom_pattern_recognizer(
        pattern_recognizer,
        skip_hash_update=True)

    # Since the hash wasn't updated the recognizers are stale from the cache
    # without the newly added one
    custom_recognizers = recognizer_registry.get_custom_recognizers()
    assert len(custom_recognizers) == 0
    # And we also didn't accessed the underlying storage
    assert recognizers_store_api_mock.times_accessed_storage == 0

    # Positive flow
    # Now do the same only this time update the hash so it should work
    recognizers_store_api_mock = RecognizerStoreApiMock()
    recognizer_registry = RecognizerRegistry(recognizers_store_api_mock)
    recognizer_registry.get_custom_recognizers()
    assert recognizers_store_api_mock.times_accessed_storage == 0
    recognizers_store_api_mock.add_custom_pattern_recognizer(
        pattern_recognizer,
        skip_hash_update=False)
    # Hash changed, so the registry must refresh from storage this time
    custom_recognizers = recognizer_registry.get_custom_recognizers()
    assert len(custom_recognizers) == 1
    # Accessed again
    assert recognizers_store_api_mock.times_accessed_storage == 1
def serve_command_handler(enable_trace_pii,
                          env_grpc_port=False,
                          grpc_port=3000):
    """Start the analyzer GRPC server and block until interrupted.

    :param enable_trace_pii: bool, whether PII values may be traced
    :param env_grpc_port: when truthy, read the listening port from the
        GRPC_PORT environment variable (falling back to grpc_port when
        the variable is unset or empty)
    :param grpc_port: int, default port to listen on
    """
    logger.info("Starting GRPC server")
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    logger.info("GRPC started")

    logger.info("Creating RecognizerRegistry")
    registry = RecognizerRegistry()
    logger.info("RecognizerRegistry created")
    logger.info("Creating SpacyNlpEngine")
    nlp_engine = SpacyNlpEngine()
    logger.info("SpacyNlpEngine created")

    analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server(
        AnalyzerEngine(registry=registry,
                       nlp_engine=nlp_engine,
                       enable_trace_pii=enable_trace_pii,
                       use_recognizer_store=True), server)
    logger.info("Added AnalyzeServiceServicer to server")

    if env_grpc_port:
        logger.info("Getting port {}".format(env_grpc_port))
        port = os.environ.get('GRPC_PORT')
        # BUG FIX: the original condition was
        # `if port is not None or port != '':` — when GRPC_PORT is unset,
        # `port` is None, the second clause (None != '') is True, and
        # int(None) raised TypeError. Convert only a non-empty value.
        if port:
            grpc_port = int(port)
    else:
        logger.info("env_grpc_port not provided. "
                    "Using grpc_port {}".format(grpc_port))

    server.add_insecure_port('[::]:' + str(grpc_port))
    logger.info("Starting GRPC listener at port {}".format(grpc_port))
    server.start()
    try:
        # Keep the main thread alive; grpc serves on worker threads.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        server.stop(0)
def test_when_add_recognizer_then_also_outputs_others(nlp_engine):
    """A custom recognizer coexists with the predefined ones."""
    rocket_recognizer = PatternRecognizer(
        "ROCKET",
        name="Rocket recognizer",
        patterns=[Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8)],
        supported_language="en",
    )
    registry = RecognizerRegistry()
    registry.add_recognizer(rocket_recognizer)
    registry.load_predefined_recognizers()

    # Custom recognizer plus the predefined set
    assert len(registry.recognizers) > 1

    analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)
    findings = analyzer.analyze(text="Michael Jones has a rocket",
                                language="en")

    # One PERSON result and one ROCKET result
    assert len(findings) == 2
def test_when_remove_pattern_recognizer_then_item_removed():
    """A recognizer removed by name disappears from the registry."""
    spaceship_recognizer = PatternRecognizer(
        "SPACESHIP",
        name="Spaceship recognizer",
        patterns=[Pattern("spaceship pattern", r"\W*(spaceship)\W*", 0.8)],
    )

    # Start from an empty recognizer registry
    registry = RecognizerRegistry(recognizers=[])
    assert len(registry.recognizers) == 0

    # Add a recognizer for the word "spaceship" (case insensitive)
    registry.add_recognizer(spaceship_recognizer)
    assert len(registry.recognizers) == 1

    # Removing it by name empties the registry again
    registry.remove_recognizer("Spaceship recognizer")
    assert len(registry.recognizers) == 0
def loaded_registry():
    """Return a new RecognizerRegistry with default configuration."""
    return RecognizerRegistry()