def test_to_dict_correct_dictionary(self):
    ent_recognizer = EntityRecognizer(["ENTITY"])

    entity_rec_dict = ent_recognizer.to_dict()

    assert entity_rec_dict is not None
    assert entity_rec_dict['supported_entities'] == ['ENTITY']
    assert entity_rec_dict['supported_language'] == 'en'
def test_when_to_dict_then_return_correct_dictionary():
    ent_recognizer = EntityRecognizer(["ENTITY"])

    entity_rec_dict = ent_recognizer.to_dict()

    assert entity_rec_dict is not None
    assert entity_rec_dict["supported_entities"] == ["ENTITY"]
    assert entity_rec_dict["supported_language"] == "en"
def test_when_remove_duplicates_different_then_entity_not_removed():
    # identical spans with different entity types should both be kept
    arr = [
        RecognizerResult(
            start=0,
            end=5,
            score=0.1,
            entity_type="x",
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        ),
        RecognizerResult(
            start=0,
            end=5,
            score=0.5,
            entity_type="y",
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        ),
    ]

    results = EntityRecognizer.remove_duplicates(arr)

    assert len(results) == 2
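# A hedged sketch, not part of the original test suite: the complementary case to the
# test above, where two results share the same span *and* the same entity type. Based
# on the remove_duplicates call shown above, only the higher-scoring result is expected
# to survive. The test name, the "x" entity type and the scores are illustrative
# assumptions; imports mirror the tests above
# (from presidio_analyzer import EntityRecognizer, RecognizerResult).
def test_when_remove_duplicates_same_entity_then_lower_score_removed_sketch():
    arr = [
        RecognizerResult(start=0, end=5, score=0.1, entity_type="x"),
        RecognizerResult(start=0, end=5, score=0.5, entity_type="x"),
    ]

    results = EntityRecognizer.remove_duplicates(arr)

    # expectation under the assumption above: the 0.1 result is dropped
    assert len(results) == 1
    assert results[0].score == 0.5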
def __analyze_patterns(
    self, text: str, flags: int = None
) -> List[RecognizerResult]:
    """
    Evaluate all patterns in the provided text, including words in the provided deny-list.

    :param text: text to analyze
    :param flags: regex flags
    :return: A list of RecognizerResult
    """
    flags = flags if flags else re.DOTALL | re.MULTILINE
    results = []
    for pattern in self.patterns:
        match_start_time = datetime.datetime.now()
        matches = re.finditer(pattern.regex, text, flags=flags)
        match_time = datetime.datetime.now() - match_start_time
        logger.debug(
            "--- match_time[%s]: %s.%s seconds",
            pattern.name,
            match_time.seconds,
            match_time.microseconds,
        )

        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == "":
                continue

            score = pattern.score

            validation_result = self.validate_result(current_match)
            description = self.build_regex_explanation(
                self.name, pattern.name, pattern.regex, score, validation_result
            )
            pattern_result = RecognizerResult(
                self.supported_entities[0], start, end, score, description
            )

            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            invalidation_result = self.invalidate_result(current_match)
            if invalidation_result is not None and invalidation_result:
                pattern_result.score = EntityRecognizer.MIN_SCORE

            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    results = EntityRecognizer.remove_duplicates(results)

    return results
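# A minimal usage sketch, not taken from the module above: the private
# __analyze_patterns method is normally exercised indirectly, by constructing a
# PatternRecognizer with one or more Pattern objects and calling its public analyze()
# method, which drives the regex matching, validation and scoring shown above.
# The "US_ZIP_CODE" entity name, the regex and the 0.4 score are illustrative assumptions.
from presidio_analyzer import Pattern, PatternRecognizer

zip_pattern = Pattern(name="zip (weak)", regex=r"\b\d{5}\b", score=0.4)
zip_recognizer = PatternRecognizer(
    supported_entity="US_ZIP_CODE", patterns=[zip_pattern]
)

# Each returned RecognizerResult carries the entity type, the match span
# and the pattern's score, e.g. [type: US_ZIP_CODE, start: 15, end: 20, score: 0.4]
results = zip_recognizer.analyze(
    text="My zip code is 90210", entities=["US_ZIP_CODE"]
)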
def test_from_dict_returns_instance(self):
    ent_rec_dict = {
        "supported_entities": ["A", "B", "C"],
        "supported_language": "he",
    }
    entity_rec = EntityRecognizer.from_dict(ent_rec_dict)

    assert entity_rec.supported_entities == ["A", "B", "C"]
    assert entity_rec.supported_language == "he"
    assert entity_rec.version == "0.0.1"
def test_index_finding(self):
    # This test uses a simulated recognize result for the following
    # text: "my phone number is:(425) 882-9090"
    match = "(425) 882-9090"
    # the start index of the match
    start = 19
    tokens = ['my', 'phone', 'number', 'is:(425', ')', '882', '-', '9090']
    tokens_indices = [0, 3, 9, 16, 23, 25, 28, 29]
    index = EntityRecognizer.find_index_of_match_token(
        match, start, tokens, tokens_indices)
    assert index == 3
def test_when_index_finding_then_succeed():
    # This test uses a simulated recognize result for the following
    # text: "my phone number is:(425) 882-9090"
    match = "(425) 882-9090"
    # the start index of the match
    start = 19
    tokens = ["my", "phone", "number", "is:(425", ")", "882", "-", "9090"]
    tokens_indices = [0, 3, 9, 16, 23, 25, 28, 29]
    index = EntityRecognizer._find_index_of_match_token(
        match, start, tokens, tokens_indices)
    assert index == 3
def test_when_multiple_entities_from_same_recognizer_only_one_is_returned():
    registry = RecognizerRegistry()
    recognizer_supporting_two_ents = EntityRecognizer(
        supported_entities=["A", "B"], name="MyReco"
    )
    registry.add_recognizer(recognizer_supporting_two_ents)

    recognizers = registry.get_recognizers(
        language="en", entities=["A", "B"], all_fields=False
    )

    assert len(recognizers) == 1
    assert recognizers[0].name == "MyReco"
def get_mock_custom_recognizer(self, lang, entities, name):
    return EntityRecognizer(
        supported_entities=entities, name=name, supported_language=lang
    )
def analyze(
    self,
    text: str,
    language: str,
    entities: Optional[List[str]] = None,
    correlation_id: Optional[str] = None,
    score_threshold: Optional[float] = None,
    return_decision_process: Optional[bool] = False,
) -> List[RecognizerResult]:
    """
    Find PII entities in text using different PII recognizers for a given language.

    :param text: the text to analyze
    :param language: the language of the text
    :param entities: List of PII entities that should be looked for in the text.
    If entities=None then all entities are looked for.
    :param correlation_id: cross call ID for this request
    :param score_threshold: A minimum value for which to return an identified entity
    :param return_decision_process: Whether the analysis decision process steps
    should be returned in the response.
    :return: an array of the found entities in the text

    :example:

    >>> from presidio_analyzer import AnalyzerEngine

    >>> # Set up the engine, loads the NLP module (spaCy model by default)
    >>> # and other PII recognizers
    >>> analyzer = AnalyzerEngine()

    >>> # Call analyzer to get results
    >>> results = analyzer.analyze(text='My phone number is 212-555-5555', entities=['PHONE_NUMBER'], language='en') # noqa D501
    >>> print(results)
    [type: PHONE_NUMBER, start: 19, end: 31, score: 0.85]
    """
    all_fields = not entities
    recognizers = self.registry.get_recognizers(
        language=language, entities=entities, all_fields=all_fields
    )

    if all_fields:
        # Since all_fields=True, list all entities by iterating
        # over all recognizers
        entities = self.get_supported_entities(language=language)

    # run the nlp pipeline over the given text, store the results in
    # a NlpArtifacts instance
    nlp_artifacts = self.nlp_engine.process_text(text, language)

    if self.log_decision_process:
        self.app_tracer.trace(
            correlation_id, "nlp artifacts:" + nlp_artifacts.to_json()
        )

    results = []
    for recognizer in recognizers:
        # Lazy loading of the relevant recognizers
        if not recognizer.is_loaded:
            recognizer.load()
            recognizer.is_loaded = True

        # analyze using the current recognizer and append the results
        current_results = recognizer.analyze(
            text=text, entities=entities, nlp_artifacts=nlp_artifacts
        )
        if current_results:
            results.extend(current_results)

    if self.log_decision_process:
        self.app_tracer.trace(
            correlation_id,
            json.dumps([str(result.to_dict()) for result in results]),
        )

    # Remove duplicates or low score results
    results = EntityRecognizer.remove_duplicates(results)
    results = self.__remove_low_scores(results, score_threshold)

    if not return_decision_process:
        results = self.__remove_decision_process(results)

    return results
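# A small usage sketch for the analyze() method above, complementing its docstring
# example: it passes the score_threshold and return_decision_process parameters that
# the method accepts. The sample text is reused from the docstring; the 0.5 threshold
# is an illustrative assumption.
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="My phone number is 212-555-5555",
    language="en",
    score_threshold=0.5,           # drop results scoring below this value
    return_decision_process=True,  # keep each result's analysis_explanation
)
for result in results:
    print(result.entity_type, result.score, result.analysis_explanation)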