def test_presidio_psudonymize_two_entities(
    text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
):
    """Both fake values appear in every pseudonym and non-PII text survives."""
    detections = [
        RecognizerResult(entity_type=entity1, start=start1, end=end1, score=0.85),
        RecognizerResult(entity_type=entity2, start=start2, end=end2, score=0.85),
    ]
    engine = PresidioPseudonymization(
        custom_faker=fake_faker,
        lower_case_ratio=0.0,
        map_to_presidio_entities=False,
    )
    candidates = engine.pseudonymize(
        original_text=text, presidio_response=detections, count=5
    )
    assert len(candidates) == 5
    for candidate in candidates:
        lowered = candidate.lower()
        # Substituted fake values must be present verbatim...
        assert value1 in candidate
        assert value2 in candidate
        # ...while the surrounding non-PII spans are kept (case-insensitively).
        assert text[:start1].lower() in lowered
        assert text[end1:start2].lower() in lowered
def test_remove_duplicates():
    """Identical spans of the same entity keep only the highest-scoring copy."""

    def build(score):
        # Same span and entity type every time; only the score differs.
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type="x",
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        )

    deduped = AnalyzerEngine._AnalyzerEngine__remove_duplicates(
        [build(0.1), build(0.5)]
    )
    assert len(deduped) == 1
    assert deduped[0].score == 0.5
def test_when_remove_duplicates_different_then_entity_not_removed():
    """Same span but different entity types: both results must survive."""

    def build(score, entity_type):
        # Identical span; entity type and score vary per call.
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type=entity_type,
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        )

    deduped = EntityRecognizer.remove_duplicates(
        [build(0.1, "x"), build(0.5, "y")]
    )
    assert len(deduped) == 2
def test_remove_duplicates_different_entity_no_removal(self):
    """Same span, different entity types: nothing is deduplicated."""

    def build(score, entity_type):
        # Identical span; entity type and score vary per call.
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type=entity_type,
            analysis_explanation=AnalysisExplanation(
                recognizer='test',
                original_score=0,
                pattern_name='test',
                pattern='test',
                validation_result=None,
            ),
        )

    deduped = AnalyzerEngine._AnalyzerEngine__remove_duplicates(
        [build(0.1, "x"), build(0.5, "y")]
    )
    assert len(deduped) == 2
def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    """End-to-end: analyze a known PII sentence, then anonymize the findings.

    Fix: the original per-index comparison loop passed vacuously when the
    analyzer returned fewer results than expected (an empty result list made
    the loop body never run). Assert the lengths match before comparing
    element-wise.
    """
    text_to_test = "John Smith drivers license is AC432223"

    expected_response = [
        RecognizerResult("PERSON", 0, 10, 0.85),
        RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999),
    ]

    # Create configuration containing engine name and models
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    # Create NLP engine based on configuration
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Pass the created NLP engine and supported_languages to the AnalyzerEngine
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine, supported_languages=["en"]
    )

    analyzer_results = analyzer.analyze(text_to_test, "en")

    # Guard against a short (or empty) result list slipping through unnoticed.
    assert len(analyzer_results) == len(expected_response)
    for actual, expected in zip(analyzer_results, expected_response):
        assert actual == expected

    expected_response = AnonymizerResult(
        text="<PERSON> drivers license is <US_DRIVER_LICENSE>"
    )
    expected_response.add_item(
        AnonymizedEntity(
            "replace", "US_DRIVER_LICENSE", 28, 47, "<US_DRIVER_LICENSE>"
        )
    )
    expected_response.add_item(
        AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>")
    )

    anonymizer = AnonymizerEngine()
    anonymizer_results = anonymizer.anonymize(text_to_test, analyzer_results)

    assert anonymizer_results == expected_response
def __analyze_patterns(
    self, text: str, flags: int = None
) -> List[RecognizerResult]:
    """
    Evaluate all patterns in the provided text.

    Including words in the provided deny-list

    :param text: text to analyze
    :param flags: regex flags
    :return: A list of RecognizerResult
    """
    # Default flags let '.' match newlines and anchors match per-line.
    flags = flags if flags else re.DOTALL | re.MULTILINE
    results = []
    for pattern in self.patterns:
        # Time the regex scan purely for debug logging below.
        match_start_time = datetime.datetime.now()
        matches = re.finditer(pattern.regex, text, flags=flags)
        match_time = datetime.datetime.now() - match_start_time
        logger.debug(
            "--- match_time[%s]: %s.%s seconds",
            pattern.name,
            match_time.seconds,
            match_time.microseconds,
        )
        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == "":
                continue

            score = pattern.score

            # validate_result returns True/False/None; None means
            # "no validator available / inconclusive" (see below).
            validation_result = self.validate_result(current_match)
            description = self.build_regex_explanation(
                self.name, pattern.name, pattern.regex, score, validation_result
            )
            pattern_result = RecognizerResult(
                self.supported_entities[0], start, end, score, description
            )

            # A conclusive validation overrides the pattern's base score:
            # True -> certain match, False -> certain non-match.
            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            # Invalidation (e.g. known false-positive forms) always wins,
            # even over a successful validation above.
            invalidation_result = self.invalidate_result(current_match)
            if invalidation_result is not None and invalidation_result:
                pattern_result.score = EntityRecognizer.MIN_SCORE

            # Only keep results with a non-floor score.
            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    # Different patterns may hit the same span; keep the strongest per span.
    results = EntityRecognizer.remove_duplicates(results)
    return results
def test_assert_result_within_score_range_uses_given_range():
    """The helper honors an explicitly supplied [low, high] score range."""
    cases = ((0.3, 0.2, 0.4), (0.1, 0.05, 0.15), (0.9, 0.89, 0.91))
    for score, low, high in cases:
        result = RecognizerResult(ENTITY_TYPE, 0, 10, score)
        assert_result_within_score_range(result, ENTITY_TYPE, 0, 10, low, high)
def __analyze_patterns(self, text):
    """
    Evaluates all patterns in the provided text, including words in
    the provided blacklist

    In a sentence we could get a false positive at the end of our regex, were we
    want to find the IBAN but not the false positive at the end of the match.

    i.e. "I want my deposit in DE89370400440532013000 2 days from today."

    :param text: text to analyze
    :param flags: regex flags
    :return: A list of RecognizerResult
    """
    results = []
    for pattern in self.patterns:
        matches = re.finditer(pattern.regex, text, flags=self.flags)

        for match in matches:
            # Walk capture groups from the highest-numbered down; the
            # unconditional `break` at the bottom means only the first
            # group with a non-empty span is actually processed, which
            # trims the trailing false-positive described above.
            for grp_num in reversed(range(1, len(match.groups()) + 1)):
                start = match.span(0)[0]
                # Use the group's end when it participated in the match
                # (span end > 0); otherwise fall back to the whole match.
                end = (
                    match.span(grp_num)[1]
                    if match.span(grp_num)[1] > 0
                    else match.span(0)[1]
                )
                current_match = text[start:end]

                # Skip empty results (continue, so the next group is tried)
                if current_match == "":
                    continue

                score = pattern.score

                # True/False/None tri-state; None means no validator verdict.
                validation_result = self.validate_result(current_match)
                description = PatternRecognizer.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    self.supported_entities[0], start, end, score, description
                )

                # Conclusive validation overrides the pattern's base score.
                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)
                # Stop after the first non-empty group (see note above).
                break

    return results
def test_assert_result_within_score_range_uses_given_range_fails():
    """Scores outside the supplied range must trigger an AssertionError."""
    failing_cases = ((0.3, 0.4, 0.6), (0, 0.4, 0.6), (1, 0, 0.5))
    for score, low, high in failing_cases:
        with pytest.raises(AssertionError):
            result = RecognizerResult(ENTITY_TYPE, 0, 10, score)
            assert_result_within_score_range(
                result, ENTITY_TYPE, 0, 10, low, high
            )
def __analyze_patterns(self, text):
    """
    Evaluates all patterns in the provided text, including words in
    the provided blacklist

    :param text: text to analyze
    :return: A list of RecognizerResult
    """
    results = []
    for pattern in self.patterns:
        # Time the regex scan purely for the debug log line below.
        match_start_time = datetime.datetime.now()
        # Flags are fixed here (not configurable as in newer variants).
        matches = re.finditer(pattern.regex, text,
                              flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
        match_time = datetime.datetime.now() - match_start_time
        self.logger.debug('--- match_time[%s]: %s.%s seconds',
                          pattern.name,
                          match_time.seconds,
                          match_time.microseconds)

        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == '':
                continue

            score = pattern.score

            # True/False/None tri-state; None means no validator verdict.
            validation_result = self.validate_result(current_match)
            description = PatternRecognizer.build_regex_explanation(
                self.name,
                pattern.name,
                pattern.regex,
                score,
                validation_result)
            pattern_result = RecognizerResult(
                self.supported_entities[0],
                start,
                end,
                score,
                description)

            # Conclusive validation overrides the pattern's base score.
            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            # Only keep results above the floor score.
            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    return results
def convert_to_analyze_response(self, json_obj):
    """Convert a Text Analytics JSON payload into RecognizerResult objects."""
    recognizer_results = []

    # None means the recognizer failed to load.
    if json_obj is None:
        return recognizer_results

    self.logger.info(json_obj)

    if not json_obj['documents']:
        # No documents at all: surface any service-side errors, return empty.
        if json_obj['errors']:
            self.logger.error('Text Analytics returned error: {}'
                              .format(str(json_obj['errors'])))
        return recognizer_results

    for entity in json_obj['documents'][0]['entities']:
        presidio_type = TextAnalyticsRecognizer.__convert_to_presidio_type(
            entity['type'], subtype=entity.get('subtype')
        )
        recognizer_results.append(
            RecognizerResult(
                presidio_type,
                entity['offset'],
                entity['offset'] + entity['length'],
                entity['score'],
                self.DEFAULT_EXPLANATION.format(entity['type']),
            )
        )

    return recognizer_results
def _recognizer_results_from_response(
    response: requests.Response,
) -> List[RecognizerResult]:
    """Translate the service's response to a list of RecognizerResult."""
    # The body is a JSON array; each element maps onto RecognizerResult kwargs.
    payload = json.loads(response.text)
    return [RecognizerResult(**entry) for entry in payload]
def test_subset_perturbation():
    """Restricting namesets/genders limits fakes to the matching df rows."""
    text = "My name is Dan"
    detections = [
        RecognizerResult(entity_type="PERSON", start=11, end=14, score=0.5)
    ]
    fake_df = pd.DataFrame(
        {
            "FIRST_NAME": ["Neta", "George"],
            "LAST_NAME": ["Levy", "Harrison"],
            "GENDER": ["Female", "Male"],
            "NameSet": ["Hebrew", "English"],
        }
    )
    ignore_types = ("DATE", "LOCATION", "ADDRESS", "GENDER")
    perturber = PresidioPerturb(fake_pii_df=fake_df, ignore_types=ignore_types)
    sentences = perturber.perturb(
        original_text=text,
        presidio_response=detections,
        namesets=["Hebrew"],
        genders=["Female"],
        count=5,
    )
    # Only the Hebrew/Female row ("Neta") is eligible as a replacement.
    for sentence in sentences:
        assert "neta" in sentence.lower()
def create_recognizer_result(entity_type: str, score: float, start: int, end: int):
    """Build a RecognizerResult via its JSON-dict representation."""
    return RecognizerResult.from_json(
        {
            "entity_type": entity_type,
            "score": score,
            "start": start,
            "end": end,
        }
    )
def test_given_repeat_entities_then_map_analyzer_results_returns_correct_no_of_bboxes(
    get_ocr_analyzer_results,
):
    """A repeated surface form should yield one bounding box per occurrence."""
    ocr_result, text, recognizer_result = get_ocr_analyzer_results
    # Duplicate the entity text in the OCR output and register a second match.
    ocr_result["text"][1] = "Katie"
    recognizer_result.append(RecognizerResult("PERSON", 1, 6, 0.85))
    text = " Katie Interiors was created by Katie Cromley."
    bboxes = ImageAnalyzerEngine.map_analyzer_results_to_bounding_boxes(
        recognizer_result, ocr_result, text
    )
    assert len(bboxes) == 3
def test_presidio_perturb_two_entities(
    text, entity1, entity2, start1, end1, start2, end2
):
    """Both fake values must appear, and non-PII spans must survive intact."""
    detections = [
        RecognizerResult(entity_type=entity1, start=start1, end=end1, score=0.85),
        RecognizerResult(entity_type=entity2, start=start2, end=end2, score=0.85),
    ]
    perturber = PresidioPerturb(fake_pii_df=get_mock_fake_df())
    fake_values = perturber.fake_pii
    sentences = perturber.perturb(
        original_text=text, presidio_response=detections, count=5
    )
    assert len(sentences) == 5
    for sentence in sentences:
        lowered = sentence.lower()
        # First fake row's value for each entity type must be substituted in.
        assert fake_values[entity1].str.lower()[0] in lowered
        assert fake_values[entity2].str.lower()[0] in lowered
        # Text before and between the two entities is preserved.
        assert text[:start1].lower() in lowered
        assert text[end1:start2].lower() in lowered
def test_entity_translation():
    """An EMAIL_ADDRESS detection is replaced by a fake email from the df."""
    text = "My email is [email protected]"
    detections = [
        RecognizerResult(entity_type="EMAIL_ADDRESS", start=12, end=27, score=0.5)
    ]
    perturber = PresidioPerturb(fake_pii_df=get_mock_fake_df())
    fake_values = perturber.fake_pii
    sentences = perturber.perturb(
        original_text=text, presidio_response=detections, count=1
    )
    assert fake_values["EMAIL_ADDRESS"].str.lower()[0] in sentences[0]
def test_given_multiword_entity_then_map_analyzer_returns_correct_bboxes_and_len(
    get_ocr_analyzer_results,
):
    """One entity spanning two OCR words maps to two bounding boxes."""
    ocr_result, text, recognizer_result = get_ocr_analyzer_results
    # Replace the fixture's results with a single multi-word PERSON span.
    recognizer_result = [RecognizerResult("PERSON", 32, 46, 0.85)]
    expected = [
        ImageRecognizerResult("PERSON", 32, 46, 0.85, 896, 64, 183, 40),
        ImageRecognizerResult("PERSON", 32, 46, 0.85, 141, 134, 190, 50),
    ]
    actual = ImageAnalyzerEngine.map_analyzer_results_to_bounding_boxes(
        recognizer_result, ocr_result, text
    )
    assert len(expected) == len(actual)
    assert expected == actual
def _convert_to_recognizer_result(
        self, categorized_entity: Dict) -> RecognizerResult:
    """Map a Text Analytics categorized entity onto a RecognizerResult."""
    entity_type = self._get_presidio_entity_type(categorized_entity)
    # Pull the raw fields once; the span end is offset + length.
    offset = categorized_entity[TextAnalyticsClient.OFFSET]
    length = categorized_entity[TextAnalyticsClient.LENGTH]
    confidence = categorized_entity[TextAnalyticsClient.CONFIDENCE_SCORE]
    explanation = TextAnalyticsRecognizer._build_explanation(
        original_score=confidence,
        entity_type=entity_type,
    )
    return RecognizerResult(
        entity_type=entity_type,
        start=offset,
        end=offset + length,
        score=confidence,
        analysis_explanation=explanation,
    )
def analyze(self, text, entities, nlp_artifacts=None):
    """Turn pre-computed spaCy NER entities into RecognizerResults."""
    results = []
    if not nlp_artifacts:
        # NER output is required; without it there is nothing to map.
        self.logger.warning(
            "Skipping SpaCy, nlp artifacts not provided...")
        return results

    ner_entities = nlp_artifacts.entities

    for entity in entities:
        # Guard clauses replace the original nested-if structure.
        if entity not in self.supported_entities:
            continue
        for ent in ner_entities:
            if not SpacyRecognizer.__check_label(entity, ent.label_):
                continue
            explanation = SpacyRecognizer.build_spacy_explanation(
                self.__class__.__name__, NER_STRENGTH, ent.label_)
            results.append(
                RecognizerResult(
                    entity, ent.start_char, ent.end_char,
                    NER_STRENGTH, explanation))

    return results
def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
    # Map pre-computed spaCy NER entities onto Presidio RecognizerResults
    # for the requested entity types.
    results = []
    if not nlp_artifacts:
        # NER never ran (or failed upstream); nothing to recognize.
        logger.warning("Skipping SpaCy, nlp artifacts not provided...")
        return results

    ner_entities = nlp_artifacts.entities

    for entity in entities:
        if entity not in self.supported_entities:
            continue
        for ent in ner_entities:
            # check_label_groups presumably maps Presidio entity names to
            # spaCy labels -- TODO confirm against its definition.
            if not self.__check_label(entity, ent.label_, self.check_label_groups):
                continue
            textual_explanation = self.DEFAULT_EXPLANATION.format(ent.label_)
            explanation = self.build_spacy_explanation(
                self.ner_strength, textual_explanation
            )
            # Score every NER hit with the recognizer's fixed ner_strength.
            spacy_result = RecognizerResult(
                entity, ent.start_char, ent.end_char, self.ner_strength, explanation
            )
            results.append(spacy_result)
    return results