def test_given_empty_text_to_engine_then_we_fail():
    """Anonymizing an empty string must raise InvalidParamException."""
    anonymizer = AnonymizerEngine()
    ssn_result = RecognizerResult("SSN", 0, 1, 0.5)
    expected_error = "Invalid input, text can not be empty"

    with pytest.raises(InvalidParamException, match=expected_error):
        anonymizer.anonymize("", [ssn_result], {})
def test_given_analyzer_result_with_an_incorrect_text_positions_then_we_fail(
    original_text, start, end
):
    """Analyzer results with bad start/end positions must be rejected.

    ``original_text``, ``start`` and ``end`` are supplied from outside this
    function (presumably a pytest parametrization that is not visible here).
    The expected message hard-codes a text length of 11.
    """
    anonymizer = AnonymizerEngine()
    bad_result = RecognizerResult("type", start, end, 0.5)
    expected_message = (
        f"Invalid analyzer result, start: {start} and end: {end}, "
        "while text length is only 11."
    )

    with pytest.raises(InvalidParamException, match=expected_message):
        anonymizer.anonymize(original_text, [bad_result], {})
def test_given_operator_decrypt_then_we_fail():
    """The anonymizer engine must reject the "decrypt" operator."""
    sample = "hello world, my name is Jane Doe. My number is: 03-4453334"
    operators = {"DEFAULT": OperatorConfig("decrypt", {"key": "key"})}
    name_result = RecognizerResult(
        start=24, end=32, score=0.8, entity_type="NAME"
    )
    anonymizer = AnonymizerEngine()

    with pytest.raises(
        InvalidParamException, match="Invalid operator class 'decrypt'."
    ):
        anonymizer.anonymize(sample, [name_result], operators)
class HansardTextFormatter:
    """Redact detected PERSON entities in a text and normalise its line breaks."""

    def __init__(self):
        """Build the analyzer and anonymizer engines used by the formatter.

        Note that this mutates class-level attributes of ``SpacyRecognizer``
        and ``Replace``, so it affects every user of those classes in the
        process, not only this instance.
        """
        SpacyRecognizer.ENTITIES = ["PERSON"]
        # Presumably makes the replace operator read its value from the
        # "replace_text" key used in run_anonymizer -- confirm against the
        # installed presidio version.
        Replace.NEW_VALUE = 'replace_text'
        nlp_engine = SpacyNlpEngine()
        nlp_engine.nlp['en'] = spacy.load(
            'en_core_web_lg', disable=["parser", "tagger", "lemmatizer"])
        self.analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer_engine = AnonymizerEngine()

    def run_anonymizer(self, text):
        """Return ``text`` with detected entities replaced by "[GDPRREDACT]".

        :param text: the text to analyze and anonymize
        :return: the anonymizer engine's output, or ``text`` unchanged when
            the analyzer reports nothing
        """
        results = self.analyzer_engine.analyze(
            text=text, entities=[], language='en', score_threshold=0.5)
        if not results:
            # Previously this path fell through and returned None, which made
            # run_formatter crash inside clean_text. With nothing to redact the
            # input is already the correct output.
            return text
        config = {
            "PERSON": AnonymizerConfig(
                "replace", {"replace_text": "[GDPRREDACT]"})
        }
        # NOTE(review): run_formatter feeds this value to str.replace, so the
        # engine is assumed to return a plain string here -- confirm.
        return self.anonymizer_engine.anonymize(text, results, config)

    @staticmethod
    def clean_text(text):
        """Drop raw newlines, then turn each '<BR />' marker into a newline."""
        text = text.replace('\n', '')
        text = text.replace('<BR />', '\n')
        return text

    def run_formatter(self, text):
        """Anonymize ``text`` and then clean up its line breaks."""
        anon_text = self.run_anonymizer(text)
        cleaned_text = self.clean_text(anon_text)
        return cleaned_text
def pseudonymize(
    self,
    original_text: str,
    presidio_response: List[RecognizerResult],
    count: int,
):
    """
    Build fake variants of a text from its anonymized template.

    :param original_text: str containing the original text
    :param presidio_response: list of results from Presidio, to be used to know
    where entities are
    :param count: number of perturbations to return
    :return: List[str] with fake perturbations of original text
    """
    ordered_results = list(presidio_response)
    ordered_results.sort(key=lambda result: result.start)

    engine = AnonymizerEngine()
    anonymized = engine.anonymize(
        text=original_text, analyzer_results=ordered_results
    )

    # Rewrite every angle bracket in the anonymized text as a double brace
    # so that self.parse can treat the result as a template.
    template = anonymized.text
    template = template.replace(">", "}}")
    template = template.replace("<", "{{")

    fakes = []
    for _ in range(count):
        fakes.append(self.parse(template, add_spans=False))
    return fakes
def perturb(
    self,
    original_text: str,
    presidio_response: List[RecognizerResult],
    count: int,
    genders: List[str] = None,
    namesets: List[str] = None,
):
    """
    Build fake variants of a text by sampling from its anonymized template.

    The anonymized text becomes the single entry of ``self.templates``
    before sampling.

    :param original_text: str containing the original text
    :param presidio_response: list of results from Presidio, to be used to know
    where entities are
    :param count: number of perturbations to return
    :param genders: gender values to use (options: 'female', 'male')
    :param namesets: name set values to use (options are values from the
    FakeNameGenerator NameSet column)
    :return: List[str] with fake perturbations of original text
    """
    ordered_results = list(presidio_response)
    ordered_results.sort(key=lambda result: result.start)

    engine = AnonymizerEngine()
    anonymized = engine.anonymize(
        text=original_text, analyzer_results=ordered_results
    )

    # Rewrite every angle bracket in the anonymized text as a single brace.
    template = anonymized.text
    template = template.replace(">", "}")
    template = template.replace("<", "{")
    self.templates = [template]

    samples = self.sample_examples(
        count=count, genders=genders, namesets=namesets
    )
    perturbed_texts = []
    for sample in samples:
        perturbed_texts.append(sample.full_text)
    return perturbed_texts
def test_given_several_results_then_we_filter_them_and_get_correct_mocked_result():
    """Run the engine on overlapping results with ``_operate`` swapped out.

    ``engine._operate`` is replaced by the ``_operate`` stand-in defined
    elsewhere in this module, so the assertions check whatever that stand-in
    produces rather than real anonymization output.
    """
    # (start, end, score, entity_type) for each analyzer finding.
    spans = [
        (48, 57, 0.55, "SSN"),
        (24, 32, 0.6, "FULL_NAME"),
        (24, 28, 0.9, "FIRST_NAME"),
        (29, 32, 0.6, "LAST_NAME"),
        (24, 30, 0.8, "NAME"),
        (18, 32, 0.8, "BLA"),
        (23, 35, 0.8, "BLA"),
        (28, 36, 0.8, "BLA"),
        (48, 57, 0.95, "PHONE_NUMBER"),
    ]
    analyzer_results = [
        RecognizerResult(
            start=begin, end=finish, score=confidence, entity_type=entity
        )
        for begin, finish, confidence, entity in spans
    ]

    default_operator = OperatorConfig("replace", {})
    default_operator.operator_name = ""

    engine = AnonymizerEngine()
    engine._operate = _operate
    text = "hello world, my name is Jane Doe. My number is: 034453334"
    result = engine.anonymize(
        text, analyzer_results, {"DEFAULT": default_operator}
    )

    assert result.text == "Number: I am your new text!"
    assert len(result.items) == 1
    only_item = result.items[0]
    assert only_item.operator == "hash"
    assert only_item.entity_type == "type"
    assert only_item.start == 0
    assert only_item.end == 35
    assert only_item.text == "text"
def test_given_anonymize_called_with_multiple_scenarios_then_expected_results_returned(
    anonymize_scenario,
):
    """Compare engine output with the scenario's expected result.

    Loads ``<scenario>.in.json`` as the request and ``<scenario>.out.json`` as
    the expected result. If the engine raises, the exception message is
    compared against the expected result instead.
    """
    request_payload = json.loads(
        get_scenario_file_content("anonymize", f"{anonymize_scenario}.in.json")
    )
    expected_payload = json.loads(
        get_scenario_file_content("anonymize", f"{anonymize_scenario}.out.json")
    )

    expected_items = [
        AnonymizedEntity(
            entry["anonymizer"],
            entry["entity_type"],
            entry["start"],
            entry["end"],
            entry["anonymized_text"],
        )
        for entry in expected_payload["items"]
    ]
    expected_result = AnonymizerResult(expected_payload["text"], expected_items)

    engine = AnonymizerEngine()
    operators = AnonymizerRequest.get_anonymizer_configs_from_json(
        request_payload
    )
    findings = AnonymizerRequest.handle_analyzer_results_json(request_payload)

    try:
        outcome = engine.anonymize(
            request_payload.get("text"), findings, operators
        )
    except Exception as error:
        outcome = str(error)

    assert outcome == expected_result
def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    """End-to-end check: analyze a sentence, then anonymize it with defaults."""
    text_to_test = "John Smith drivers license is AC432223"
    expected_findings = [
        RecognizerResult("PERSON", 0, 10, 0.85),
        RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999),
    ]

    # The NLP engine is built from an explicit configuration naming the engine
    # and the model to load for each language.
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }
    nlp_engine = NlpEngineProvider(
        nlp_configuration=nlp_configuration
    ).create_engine()

    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])
    analyzer_results = analyzer.analyze(text_to_test, "en")

    for position, finding in enumerate(analyzer_results):
        assert finding == expected_findings[position]

    expected_anonymized = AnonymizerResult(
        text="<PERSON> drivers license is <US_DRIVER_LICENSE>"
    )
    expected_anonymized.add_item(
        AnonymizedEntity(
            "replace", "US_DRIVER_LICENSE", 28, 47, "<US_DRIVER_LICENSE>"
        )
    )
    expected_anonymized.add_item(
        AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>")
    )

    anonymizer = AnonymizerEngine()
    anonymizer_results = anonymizer.anonymize(text_to_test, analyzer_results)

    assert anonymizer_results == expected_anonymized
class Server:
    """Flask server for anonymizer."""

    def __init__(self):
        """Configure logging, create the engines and register all routes."""
        logging_conf_path = Path(__file__).parent / LOGGING_CONF_FILE
        fileConfig(logging_conf_path)

        self.logger = logging.getLogger("presidio-anonymizer")
        # LOG_LEVEL from the environment wins; otherwise keep the current level.
        log_level = os.environ.get("LOG_LEVEL", self.logger.level)
        self.logger.setLevel(log_level)

        self.app = Flask(__name__)
        self.logger.info("Starting anonymizer engine")
        self.engine = AnonymizerEngine()
        self.decryptor = AnonymizerDecryptor()
        self.logger.info(WELCOME_MESSAGE)

        app = self.app

        @app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "Presidio Anonymizer service is up"

        @app.route("/anonymize", methods=["POST"])
        def anonymize():
            """Anonymize the posted text using the posted analyzer results."""
            payload = request.get_json()
            if not payload:
                return ErrorResponse("Invalid request json").to_json(), 400

            configs = AnonymizerRequest.get_anonymizer_configs_from_json(payload)
            findings = AnonymizerRequest.handle_analyzer_results_json(payload)
            anonymized = self.engine.anonymize(
                text=payload.get("text"),
                analyzer_results=findings,
                anonymizers_config=configs,
            )
            return anonymized.to_json()

        @app.route("/decrypt", methods=["POST"])
        def decrypt() -> Union[str, Tuple[str, int]]:
            """Decrypt the posted text with the posted key."""
            payload = request.get_json()
            if not payload:
                return ErrorResponse("Invalid request json").to_json(), 400

            plain_text = self.decryptor.decrypt(
                key=payload.get("key"), text=payload.get("text")
            )
            return jsonify(result=plain_text)

        @app.route("/anonymizers", methods=["GET"])
        def anonymizers() -> Tuple[str, int]:
            """Return a list of supported anonymizers."""
            supported = self.engine.get_anonymizers()
            return json.dumps(supported), 200

        @app.errorhandler(InvalidParamException)
        def invalid_param(err):
            """Answer parameter validation failures with HTTP 422."""
            self.logger.warning(
                f"failed to anonymize text with validation error: {err.err_msg}"
            )
            return ErrorResponse(err.err_msg).to_json(), 422

        @app.errorhandler(Exception)
        def server_error(e):
            """Log any other exception and answer with a generic HTTP 500."""
            self.logger.error(f"A fatal error occurred during execution: {e}")
            return ErrorResponse("Internal server error").to_json(), 500
def test_given_default_anonymizer_then_we_use_it():
    """The "DEFAULT" config is applied to an entity with no config of its own."""
    anonymizer = AnonymizerEngine()
    sample = "please REPLACE ME."
    ssn_result = AnalyzerResult("SSN", 7, 17, 0.8)
    default_config = AnonymizerConfig("replace", {"new_value": "and thank you"})

    outcome = anonymizer.anonymize(
        sample, [ssn_result], {"DEFAULT": default_config}
    )

    assert outcome == "please and thank you."
def run_engine_and_validate(text: str, anonymizers_config, analyzer_results,
                            expected_result):
    """Run the anonymizer engine and assert on its serialized outcome.

    On success the engine result's ``to_json()`` output is compared with
    ``expected_result``. If the engine raises, the exception message is
    compared instead, so error scenarios can be validated the same way.

    :param text: the text to anonymize
    :param anonymizers_config: anonymizer configuration passed to the engine
    :param analyzer_results: analyzer findings passed to the engine
    :param expected_result: expected JSON string, or expected error message
    :raises AssertionError: when the actual outcome differs
    """
    engine = AnonymizerEngine()
    try:
        anonymize_result = engine.anonymize(text, analyzer_results,
                                            anonymizers_config)
    except Exception as e:
        # The message is already a plain string. Calling .to_json() on it, as
        # the code used to do, raised AttributeError and hid the real failure.
        actual_output = str(e)
    else:
        actual_output = anonymize_result.to_json()
    print("********")
    print(actual_output)
    print("********")
    assert actual_output == expected_result
def test_given_specific_anonymizer_then_we_use_it():
    """An entity-specific config takes precedence over the "DEFAULT" one."""
    anonymizer = AnonymizerEngine()
    sample = "please REPLACE ME."
    ssn_result = RecognizerResult("SSN", 7, 17, 0.8)
    default_config = AnonymizerConfig("replace", {"new_value": "and thank you"})
    ssn_config = AnonymizerConfig("redact", {})
    configs = {"DEFAULT": default_config, "SSN": ssn_config}

    outcome = anonymizer.anonymize(sample, [ssn_result], configs)

    assert outcome.text == "please ."
class Presidio:
    """Pairs an analyzer engine with an anonymizer engine."""

    def __init__(self):
        """Create both engines with their default settings."""
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def analyze_and_anonymize(self, text) -> str:
        """Analyze English ``text`` and anonymize it with the "redact" operator.

        :param text: the text to process
        :return: the ``text`` attribute of the anonymizer's result
        """
        findings = self.analyzer.analyze(text=text, language='en')
        redact_everything = {"DEFAULT": OperatorConfig("redact")}
        outcome = self.anonymizer.anonymize(
            text=text,
            analyzer_results=findings,
            operators=redact_everything,
        )
        return outcome.text
def anonymize_text(text: str) -> str:
    """Analyze English ``text`` and replace each finding with "<ANONYMIZED>".

    Fresh analyzer and anonymizer engines are created on every call.

    :param text: the text to anonymize
    :return: whatever the anonymizer engine returns for the request
    """
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    findings = analyzer.analyze(text=text, language="en")
    replace_all = {
        "DEFAULT": AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"})
    }
    return anonymizer.anonymize(
        text=text,
        analyzer_results=findings,
        anonymizers_config=replace_all,
    )
def obfuscate(text):
    """Anonymize ``text`` using the findings of the module's ``analyze`` helper.

    Phone numbers get a "mask" config; every other entity is replaced with
    "<ANONYMIZED>".

    :param text: the text to anonymize
    :return: whatever the anonymizer engine returns for the request
    """
    findings = analyze(text)
    engine = AnonymizerEngine()

    default_config = AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"})
    phone_config = AnonymizerConfig(
        "mask",
        {
            "type": "mask",
            "masking_char": "*",
            "chars_to_mask": 12,
            "from_end": True,
        },
    )
    return engine.anonymize(
        text=text,
        analyzer_results=findings,
        anonymizers_config={
            "DEFAULT": default_config,
            "PHONE_NUMBER": phone_config,
        },
    )
def anonymize_text(text: str) -> str:
    """Analyze English ``text`` and replace each finding with "<ANONYMIZED>".

    Best effort: any exception is printed and ``None`` is returned instead of
    the anonymized text.

    :param text: the text to anonymize
    :return: the anonymized text, or ``None`` when an exception occurred
    """
    try:
        analyzer = AnalyzerEngine()
        anonymizer = AnonymizerEngine()
        findings = analyzer.analyze(text=text, language="en")
        replace_all = {
            "DEFAULT": AnonymizerConfig("replace", {"new_value": "<ANONYMIZED>"})
        }
        outcome = anonymizer.anonymize(
            text=text,
            analyzer_results=findings,
            operators=replace_all,
        )
        return outcome.text
    except Exception as error:
        print(f"An exception occurred. {error}")
        return None
class PDM:
    """Runs analysis and anonymization on a text and reports how long each took."""

    def __init__(self, language='en'):
        """Create the engines and remember the language passed to the analyzer.

        :param language: language code handed to the analyzer on each call
        """
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.language = language

    def predict(self, text, entities_of_interest=ENTITIES_OF_INTEREST):
        """Analyze and anonymize ``text``, returning the result with timings.

        :param text: the text to process
        :param entities_of_interest: entities passed to the analyzer
        :return: dict with the keys 'time_to_analyze', 'time_to_anonymize',
            'anonymized_text' and 'detected_items'
        """
        started = time()
        findings = self.analyzer.analyze(
            text, entities=entities_of_interest, language=self.language
        )
        analyzed = time()
        anonymized = self.anonymizer.anonymize(
            text=text, analyzer_results=findings
        )
        finished = time()

        results = {
            'time_to_analyze': f'{analyzed-started:.4f} seconds',
            'time_to_anonymize': f'{finished-analyzed:.4f} seconds',
        }
        results['anonymized_text'] = anonymized.text

        detected = []
        for item in anonymized.items:
            detected.append({
                'start': item.start,
                'end': item.end,
                'entity_type': item.entity_type,
            })
        results['detected_items'] = detected
        return results
def test_given_anonymize_called_with_error_scenarios_then_expected_errors_returned():
    """A multi-character ``masking_char`` must produce the expected error text."""
    sample = "hello world, my name is Jane Doe. My number is: 03-4453334"
    mask_params = {
        "masking_char": "non_character",
        "chars_to_mask": 6,
        "from_end": True,
    }
    operators = {"PHONE_NUMBER": OperatorConfig("mask", mask_params)}
    phone_result = RecognizerResult("PHONE_NUMBER", 48, 57, 0.95)
    anonymizer = AnonymizerEngine()

    try:
        outcome = anonymizer.anonymize(sample, [phone_result], operators)
    except Exception as error:
        outcome = str(error)

    assert outcome == "Invalid input, masking_char must be a character"
def test_given_anonymize_called_with_error_scenarios_then_expected_errors_returned(
    anonymize_scenario,
):
    """Compare the engine outcome with the scenario's expected JSON content.

    Loads ``<scenario>.in.json`` as the request and ``<scenario>.out.json`` as
    the expectation. If the engine raises, the exception message is compared
    against the expectation.
    """
    request_payload = json.loads(
        get_scenario_file_content("anonymize", f"{anonymize_scenario}.in.json")
    )
    expected_payload = json.loads(
        get_scenario_file_content("anonymize", f"{anonymize_scenario}.out.json")
    )

    anonymizer = AnonymizerEngine()
    operators = AnonymizerRequest.get_anonymizer_configs_from_json(
        request_payload
    )
    findings = AnonymizerRequest.handle_analyzer_results_json(request_payload)

    try:
        outcome = anonymizer.anonymize(
            request_payload.get("text"), findings, operators
        )
    except Exception as error:
        outcome = str(error)

    assert outcome == expected_payload
class Server:
    """Flask server for anonymizer."""

    def __init__(self):
        """Configure logging, create the engines and register all routes."""
        logging_conf_path = Path(__file__).parent / LOGGING_CONF_FILE
        fileConfig(logging_conf_path)

        self.logger = logging.getLogger("presidio-anonymizer")
        # LOG_LEVEL from the environment wins; otherwise keep the current level.
        log_level = os.environ.get("LOG_LEVEL", self.logger.level)
        self.logger.setLevel(log_level)

        self.app = Flask(__name__)
        self.logger.info("Starting anonymizer engine")
        self.anonymizer = AnonymizerEngine()
        self.deanonymize = DeanonymizeEngine()
        self.logger.info(WELCOME_MESSAGE)

        app = self.app

        @app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "Presidio Anonymizer service is up"

        @app.route("/anonymize", methods=["POST"])
        def anonymize() -> Response:
            """Anonymize the posted text; custom operators are refused."""
            payload = request.get_json()
            if not payload:
                raise BadRequest("Invalid request json")

            operators = AppEntitiesConvertor.operators_config_from_json(
                payload.get("anonymizers")
            )
            if AppEntitiesConvertor.check_custom_operator(operators):
                raise BadRequest("Custom type anonymizer is not supported")

            findings = AppEntitiesConvertor.analyzer_results_from_json(
                payload.get("analyzer_results")
            )
            anonymized = self.anonymizer.anonymize(
                text=payload.get("text"),
                analyzer_results=findings,
                operators=operators,
            )
            return Response(anonymized.to_json(), mimetype="application/json")

        @app.route("/deanonymize", methods=["POST"])
        def deanonymize() -> Response:
            """Deanonymize the posted text using the posted entities."""
            payload = request.get_json()
            if not payload:
                raise BadRequest("Invalid request json")

            text = payload.get("text")
            entities = AppEntitiesConvertor.deanonymize_entities_from_json(
                payload
            )
            operators = AppEntitiesConvertor.operators_config_from_json(
                payload.get("deanonymizers")
            )
            restored = self.deanonymize.deanonymize(
                text=text, entities=entities, operators=operators
            )
            return Response(restored.to_json(), mimetype="application/json")

        @app.route("/anonymizers", methods=["GET"])
        def anonymizers():
            """Return a list of supported anonymizers."""
            supported = self.anonymizer.get_anonymizers()
            return jsonify(supported)

        @app.route("/deanonymizers", methods=["GET"])
        def deanonymizers():
            """Return a list of supported deanonymizers."""
            supported = self.deanonymize.get_deanonymizers()
            return jsonify(supported)

        @app.errorhandler(InvalidParamException)
        def invalid_param(err):
            """Answer parameter validation failures with HTTP 422."""
            self.logger.warning(
                f"Request failed with parameter validation error: {err.err_msg}"
            )
            return jsonify(error=err.err_msg), 422

        @app.errorhandler(HTTPException)
        def http_exception(e):
            """Answer HTTP exceptions with their own description and code."""
            return jsonify(error=e.description), e.code

        @app.errorhandler(Exception)
        def server_error(e):
            """Log any other exception and answer with a generic HTTP 500."""
            self.logger.error(f"A fatal error occurred during execution: {e}")
            return jsonify(error="Internal server error"), 500
class PresidioPIIAnalyzer(BaseAnalyzer):
    """Analyzer that detects PII with Presidio and can also anonymize it."""

    _analyzer: AnalyzerEngine = PrivateAttr()
    _anonymizer: AnonymizerEngine = PrivateAttr()
    TYPE: str = "PresidioPII"
    # NLP engine settings; a default PresidioEngineConfig is used when unset.
    engine_config: Optional[PresidioEngineConfig] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # To add customer recognizers refer https://microsoft.github.io/presidio/analyzer/adding_recognizers/
    entity_recognizers: Optional[List[EntityRecognizer]] = None
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, OperatorConfig]] = None

    def __init__(self, **data: Any):
        """Build the Presidio analyzer and anonymizer engines.

        For a "spacy" NLP engine, each configured model is probed and
        downloaded through ``spacy.cli.download`` when the probe fails.

        :param data: field values forwarded to the base class initializer
        """
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        # If spacy engine then load Spacy models and select languages
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if self.engine_config.nlp_engine_name == "spacy":
                try:
                    spacy_model = __import__(model_config.model_name)
                    spacy_model.load()
                    logger.info(
                        f"Spacy model {model_config.model_name} is already downloaded"
                    )
                except Exception:
                    # Was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit and then started a download
                    # the user had just tried to abort.
                    logger.warning(
                        f"Spacy model {model_config.model_name} is not downloaded"
                    )
                    logger.warning(
                        f"Downloading spacy model {model_config.model_name}, it might take some time"
                    )
                    from spacy.cli import download

                    download(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(
            nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                        supported_languages=languages)

        # self._analyzer.registry.load_predefined_recognizers()
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with its default settings
        self._anonymizer = AnonymizerEngine()

    def analyze_input(
        self,
        source_response_list: List[AnalyzerRequest],
        analyzer_config: PresidioPIIAnalyzerConfig,
        language: Optional[str] = "en",
        **kwargs,
    ) -> List[AnalyzerResponse]:
        """Analyze each request's text and, unless disabled, anonymize it.

        :param source_response_list: requests whose ``processed_text`` is
            analyzed
        :param analyzer_config: per-call settings (entities, analyze_only,
            replace_original_text, optional anonymizers_config override)
        :param language: language code passed to the Presidio analyzer
        :param kwargs: accepted but not used by this method
        :return: one AnalyzerResponse per request, in input order
        """
        analyzer_output: List[AnalyzerResponse] = []

        for source_response in source_response_list:
            analyzer_result = self._analyzer.analyze(
                text=source_response.processed_text,
                entities=analyzer_config.entities,
                return_decision_process=analyzer_config.return_decision_process,
                language=language,
            )

            anonymized_result = None
            if not analyzer_config.analyze_only:
                # The per-call configuration overrides the instance-level one.
                anonymizers_config = (analyzer_config.anonymizers_config
                                      or self.anonymizers_config)

                # Missing or empty text is skipped rather than passed to the
                # anonymizer.
                if (source_response.processed_text is not None
                        and len(source_response.processed_text) > 0):
                    anonymized_result = self._anonymizer.anonymize(
                        text=source_response.processed_text,
                        operators=anonymizers_config,
                        analyzer_results=analyzer_result,
                    )

            if analyzer_config.replace_original_text and anonymized_result is not None:
                text = anonymized_result.text
            else:
                text = source_response.processed_text

            analyzer_output.append(
                AnalyzerResponse(
                    processed_text=text,
                    meta=source_response.meta,
                    segmented_data={
                        "analyzer_result":
                        [vars(result) for result in analyzer_result],
                        "anonymized_result":
                        None if not anonymized_result else
                        [vars(item) for item in anonymized_result.items],
                        "anonymized_text":
                        None
                        if not anonymized_result else anonymized_result.text,
                    },
                    source_name=source_response.source_name,
                ))

        return analyzer_output
def test_given_none_as_anonymziers_list_then_we_fall_to_default():
    """With no anonymizer config, the entity is replaced by "<SSN>"."""
    anonymizer = AnonymizerEngine()
    sample = "please REPLACE ME."
    ssn_result = RecognizerResult("SSN", 7, 17, 0.8)

    outcome = anonymizer.anonymize(sample, [ssn_result])

    assert outcome.text == "please <SSN>."
def test_given_empty_analyzers_list_then_we_get_same_text_back():
    """With no analyzer results, the text comes back unchanged."""
    anonymizer = AnonymizerEngine()
    sample = "one two three"

    outcome = anonymizer.anonymize(sample, [], {})

    assert outcome.text == sample
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities.engine import OperatorConfig

# Example script: detect PII in a sample sentence, then replace every
# detected entity with the same placeholder.
text_to_anonymize = "His name is Tom and his phone number is 212-555-5555"

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Step 1: detection.
analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')
print("\nPII Detection:")
print(analyzer_results)

# Step 2: anonymization. The "DEFAULT" entry is the only operator configured.
operators = {
    "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})
}
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operators,
)
print("\nPII Anonymization:")
print(anonymized_results.to_json())