def test_presidio_psudonymize_two_entities(
    text, entity1, entity2, start1, end1, start2, end2, value1, value2, fake_faker
):
    """Both fake values appear in every pseudonym and non-PII text survives."""
    detections = [
        RecognizerResult(entity_type=entity1, start=start1, end=end1, score=0.85),
        RecognizerResult(entity_type=entity2, start=start2, end=end2, score=0.85),
    ]
    engine = PresidioPseudonymization(
        custom_faker=fake_faker,
        lower_case_ratio=0.0,
        map_to_presidio_entities=False,
    )
    candidates = engine.pseudonymize(
        original_text=text, presidio_response=detections, count=5
    )
    assert len(candidates) == 5
    for candidate in candidates:
        lowered = candidate.lower()
        # Substituted fake values must be present verbatim...
        assert value1 in candidate
        assert value2 in candidate
        # ...while the surrounding non-PII spans are kept (case-insensitively).
        assert text[:start1].lower() in lowered
        assert text[end1:start2].lower() in lowered
def test_remove_duplicates():
    """Identical spans of the same entity keep only the highest-scoring copy."""

    def build(score):
        # Same span and entity type every time; only the score differs.
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type="x",
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        )

    deduped = AnalyzerEngine._AnalyzerEngine__remove_duplicates(
        [build(0.1), build(0.5)]
    )
    assert len(deduped) == 1
    assert deduped[0].score == 0.5
def test_when_remove_duplicates_different_then_entity_not_removed():
    """Same span but different entity types: both results must survive."""

    def build(score, entity_type):
        # Identical span; entity type and score vary per call.
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type=entity_type,
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        )

    deduped = EntityRecognizer.remove_duplicates(
        [build(0.1, "x"), build(0.5, "y")]
    )
    assert len(deduped) == 2
def test_remove_duplicates_different_entity_no_removal(self):
    """Same span, different entity types: nothing is deduplicated."""

    def build(score, entity_type):
        # Identical span; entity type and score vary per call.
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type=entity_type,
            analysis_explanation=AnalysisExplanation(
                recognizer='test',
                original_score=0,
                pattern_name='test',
                pattern='test',
                validation_result=None,
            ),
        )

    deduped = AnalyzerEngine._AnalyzerEngine__remove_duplicates(
        [build(0.1, "x"), build(0.5, "y")]
    )
    assert len(deduped) == 2
def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    """End-to-end: analyze a known PII sentence, then anonymize the findings.

    Fix: the original per-index comparison loop passed vacuously when the
    analyzer returned fewer results than expected (an empty result list made
    the loop body never run). Assert the lengths match before comparing
    element-wise.
    """
    text_to_test = "John Smith drivers license is AC432223"

    expected_response = [
        RecognizerResult("PERSON", 0, 10, 0.85),
        RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999),
    ]

    # Create configuration containing engine name and models
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    # Create NLP engine based on configuration
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Pass the created NLP engine and supported_languages to the AnalyzerEngine
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine, supported_languages=["en"]
    )

    analyzer_results = analyzer.analyze(text_to_test, "en")

    # Guard against a short (or empty) result list slipping through unnoticed.
    assert len(analyzer_results) == len(expected_response)
    for actual, expected in zip(analyzer_results, expected_response):
        assert actual == expected

    expected_response = AnonymizerResult(
        text="<PERSON> drivers license is <US_DRIVER_LICENSE>"
    )
    expected_response.add_item(
        AnonymizedEntity(
            "replace", "US_DRIVER_LICENSE", 28, 47, "<US_DRIVER_LICENSE>"
        )
    )
    expected_response.add_item(
        AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>")
    )

    anonymizer = AnonymizerEngine()
    anonymizer_results = anonymizer.anonymize(text_to_test, analyzer_results)

    assert anonymizer_results == expected_response
def __analyze_patterns(
    self, text: str, flags: int = None
) -> List[RecognizerResult]:
    """
    Evaluate all patterns in the provided text.

    Including words in the provided deny-list

    :param text: text to analyze
    :param flags: regex flags
    :return: A list of RecognizerResult
    """
    # Default flags let '.' match newlines and anchors match per-line.
    flags = flags if flags else re.DOTALL | re.MULTILINE
    results = []
    for pattern in self.patterns:
        # Time the regex scan purely for debug logging below.
        match_start_time = datetime.datetime.now()
        matches = re.finditer(pattern.regex, text, flags=flags)
        match_time = datetime.datetime.now() - match_start_time
        logger.debug(
            "--- match_time[%s]: %s.%s seconds",
            pattern.name,
            match_time.seconds,
            match_time.microseconds,
        )
        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == "":
                continue

            score = pattern.score

            # validate_result returns True/False/None; None means
            # "no validator available / inconclusive" (see below).
            validation_result = self.validate_result(current_match)
            description = self.build_regex_explanation(
                self.name, pattern.name, pattern.regex, score, validation_result
            )
            pattern_result = RecognizerResult(
                self.supported_entities[0], start, end, score, description
            )

            # A conclusive validation overrides the pattern's base score:
            # True -> certain match, False -> certain non-match.
            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            # Invalidation (e.g. known false-positive forms) always wins,
            # even over a successful validation above.
            invalidation_result = self.invalidate_result(current_match)
            if invalidation_result is not None and invalidation_result:
                pattern_result.score = EntityRecognizer.MIN_SCORE

            # Only keep results with a non-floor score.
            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    # Different patterns may hit the same span; keep the strongest per span.
    results = EntityRecognizer.remove_duplicates(results)
    return results
def test_assert_result_within_score_range_uses_given_range():
    """The helper honors an explicitly supplied [low, high] score range."""
    cases = ((0.3, 0.2, 0.4), (0.1, 0.05, 0.15), (0.9, 0.89, 0.91))
    for score, low, high in cases:
        result = RecognizerResult(ENTITY_TYPE, 0, 10, score)
        assert_result_within_score_range(result, ENTITY_TYPE, 0, 10, low, high)
def __analyze_patterns(self, text):
    """
    Evaluates all patterns in the provided text, including words in
    the provided blacklist

    In a sentence we could get a false positive at the end of our regex, were we
    want to find the IBAN but not the false positive at the end of the match.

    i.e. "I want my deposit in DE89370400440532013000 2 days from today."

    :param text: text to analyze
    :param flags: regex flags
    :return: A list of RecognizerResult
    """
    results = []
    for pattern in self.patterns:
        matches = re.finditer(pattern.regex, text, flags=self.flags)

        for match in matches:
            # Walk capture groups from the highest-numbered down; the
            # unconditional `break` at the bottom means only the first
            # group with a non-empty span is actually processed, which
            # trims the trailing false-positive described above.
            for grp_num in reversed(range(1, len(match.groups()) + 1)):
                start = match.span(0)[0]
                # Use the group's end when it participated in the match
                # (span end > 0); otherwise fall back to the whole match.
                end = (
                    match.span(grp_num)[1]
                    if match.span(grp_num)[1] > 0
                    else match.span(0)[1]
                )
                current_match = text[start:end]

                # Skip empty results (continue, so the next group is tried)
                if current_match == "":
                    continue

                score = pattern.score

                # True/False/None tri-state; None means no validator verdict.
                validation_result = self.validate_result(current_match)
                description = PatternRecognizer.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    self.supported_entities[0], start, end, score, description
                )

                # Conclusive validation overrides the pattern's base score.
                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)
                # Stop after the first non-empty group (see note above).
                break

    return results
def test_assert_result_within_score_range_uses_given_range_fails():
    """Scores outside the supplied range must trigger an AssertionError."""
    failing_cases = ((0.3, 0.4, 0.6), (0, 0.4, 0.6), (1, 0, 0.5))
    for score, low, high in failing_cases:
        with pytest.raises(AssertionError):
            result = RecognizerResult(ENTITY_TYPE, 0, 10, score)
            assert_result_within_score_range(
                result, ENTITY_TYPE, 0, 10, low, high
            )
def __analyze_patterns(self, text):
    """
    Evaluates all patterns in the provided text, including words in
    the provided blacklist

    :param text: text to analyze
    :return: A list of RecognizerResult
    """
    results = []
    for pattern in self.patterns:
        # Time the regex scan purely for the debug log line below.
        match_start_time = datetime.datetime.now()
        # Flags are fixed here (not configurable as in newer variants).
        matches = re.finditer(pattern.regex, text,
                              flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)
        match_time = datetime.datetime.now() - match_start_time
        self.logger.debug('--- match_time[%s]: %s.%s seconds',
                          pattern.name,
                          match_time.seconds,
                          match_time.microseconds)

        for match in matches:
            start, end = match.span()
            current_match = text[start:end]

            # Skip empty results
            if current_match == '':
                continue

            score = pattern.score

            # True/False/None tri-state; None means no validator verdict.
            validation_result = self.validate_result(current_match)
            description = PatternRecognizer.build_regex_explanation(
                self.name,
                pattern.name,
                pattern.regex,
                score,
                validation_result)
            pattern_result = RecognizerResult(
                self.supported_entities[0],
                start,
                end,
                score,
                description)

            # Conclusive validation overrides the pattern's base score.
            if validation_result is not None:
                if validation_result:
                    pattern_result.score = EntityRecognizer.MAX_SCORE
                else:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

            # Only keep results above the floor score.
            if pattern_result.score > EntityRecognizer.MIN_SCORE:
                results.append(pattern_result)

    return results
def convert_to_analyze_response(self, json_obj):
    """Convert a Text Analytics JSON payload into RecognizerResult objects."""
    recognizer_results = []

    # None means the recognizer failed to load.
    if json_obj is None:
        return recognizer_results

    self.logger.info(json_obj)

    if not json_obj['documents']:
        # No documents at all: surface any service-side errors, return empty.
        if json_obj['errors']:
            self.logger.error('Text Analytics returned error: {}'
                              .format(str(json_obj['errors'])))
        return recognizer_results

    for entity in json_obj['documents'][0]['entities']:
        presidio_type = TextAnalyticsRecognizer.__convert_to_presidio_type(
            entity['type'], subtype=entity.get('subtype')
        )
        recognizer_results.append(
            RecognizerResult(
                presidio_type,
                entity['offset'],
                entity['offset'] + entity['length'],
                entity['score'],
                self.DEFAULT_EXPLANATION.format(entity['type']),
            )
        )

    return recognizer_results
def _recognizer_results_from_response(
    response: requests.Response,
) -> List[RecognizerResult]:
    """Translate the service's response to a list of RecognizerResult."""
    # The body is a JSON array; each element maps onto RecognizerResult kwargs.
    payload = json.loads(response.text)
    return [RecognizerResult(**entry) for entry in payload]
def test_subset_perturbation():
    """Restricting namesets/genders limits fakes to the matching df rows."""
    text = "My name is Dan"
    detections = [
        RecognizerResult(entity_type="PERSON", start=11, end=14, score=0.5)
    ]
    fake_df = pd.DataFrame(
        {
            "FIRST_NAME": ["Neta", "George"],
            "LAST_NAME": ["Levy", "Harrison"],
            "GENDER": ["Female", "Male"],
            "NameSet": ["Hebrew", "English"],
        }
    )
    ignore_types = ("DATE", "LOCATION", "ADDRESS", "GENDER")
    perturber = PresidioPerturb(fake_pii_df=fake_df, ignore_types=ignore_types)
    sentences = perturber.perturb(
        original_text=text,
        presidio_response=detections,
        namesets=["Hebrew"],
        genders=["Female"],
        count=5,
    )
    # Only the Hebrew/Female row ("Neta") is eligible as a replacement.
    for sentence in sentences:
        assert "neta" in sentence.lower()
def create_recognizer_result(entity_type: str, score: float, start: int, end: int):
    """Build a RecognizerResult via its JSON-dict representation."""
    return RecognizerResult.from_json(
        {
            "entity_type": entity_type,
            "score": score,
            "start": start,
            "end": end,
        }
    )
def test_given_repeat_entities_then_map_analyzer_results_returns_correct_no_of_bboxes(
    get_ocr_analyzer_results,
):
    """A repeated surface form should yield one bounding box per occurrence."""
    ocr_result, text, recognizer_result = get_ocr_analyzer_results
    # Duplicate the entity text in the OCR output and register a second match.
    ocr_result["text"][1] = "Katie"
    recognizer_result.append(RecognizerResult("PERSON", 1, 6, 0.85))
    text = " Katie Interiors was created by Katie Cromley."
    bboxes = ImageAnalyzerEngine.map_analyzer_results_to_bounding_boxes(
        recognizer_result, ocr_result, text
    )
    assert len(bboxes) == 3
def test_presidio_perturb_two_entities(
    text, entity1, entity2, start1, end1, start2, end2
):
    """Both fake values must appear, and non-PII spans must survive intact."""
    detections = [
        RecognizerResult(entity_type=entity1, start=start1, end=end1, score=0.85),
        RecognizerResult(entity_type=entity2, start=start2, end=end2, score=0.85),
    ]
    perturber = PresidioPerturb(fake_pii_df=get_mock_fake_df())
    fake_values = perturber.fake_pii
    sentences = perturber.perturb(
        original_text=text, presidio_response=detections, count=5
    )
    assert len(sentences) == 5
    for sentence in sentences:
        lowered = sentence.lower()
        # First fake row's value for each entity type must be substituted in.
        assert fake_values[entity1].str.lower()[0] in lowered
        assert fake_values[entity2].str.lower()[0] in lowered
        # Text before and between the two entities is preserved.
        assert text[:start1].lower() in lowered
        assert text[end1:start2].lower() in lowered
def test_entity_translation():
    """An EMAIL_ADDRESS detection is replaced by a fake email from the df."""
    text = "My email is [email protected]"
    detections = [
        RecognizerResult(entity_type="EMAIL_ADDRESS", start=12, end=27, score=0.5)
    ]
    perturber = PresidioPerturb(fake_pii_df=get_mock_fake_df())
    fake_values = perturber.fake_pii
    sentences = perturber.perturb(
        original_text=text, presidio_response=detections, count=1
    )
    assert fake_values["EMAIL_ADDRESS"].str.lower()[0] in sentences[0]
def test_given_multiword_entity_then_map_analyzer_returns_correct_bboxes_and_len(
    get_ocr_analyzer_results,
):
    """One entity spanning two OCR words maps to two bounding boxes."""
    ocr_result, text, recognizer_result = get_ocr_analyzer_results
    # Replace the fixture's results with a single multi-word PERSON span.
    recognizer_result = [RecognizerResult("PERSON", 32, 46, 0.85)]
    expected = [
        ImageRecognizerResult("PERSON", 32, 46, 0.85, 896, 64, 183, 40),
        ImageRecognizerResult("PERSON", 32, 46, 0.85, 141, 134, 190, 50),
    ]
    actual = ImageAnalyzerEngine.map_analyzer_results_to_bounding_boxes(
        recognizer_result, ocr_result, text
    )
    assert len(expected) == len(actual)
    assert expected == actual
def _convert_to_recognizer_result(
        self, categorized_entity: Dict) -> RecognizerResult:
    """Map a Text Analytics categorized entity onto a RecognizerResult."""
    entity_type = self._get_presidio_entity_type(categorized_entity)
    # Pull the raw fields once; the span end is offset + length.
    offset = categorized_entity[TextAnalyticsClient.OFFSET]
    length = categorized_entity[TextAnalyticsClient.LENGTH]
    confidence = categorized_entity[TextAnalyticsClient.CONFIDENCE_SCORE]
    explanation = TextAnalyticsRecognizer._build_explanation(
        original_score=confidence,
        entity_type=entity_type,
    )
    return RecognizerResult(
        entity_type=entity_type,
        start=offset,
        end=offset + length,
        score=confidence,
        analysis_explanation=explanation,
    )
def analyze(self, text, entities, nlp_artifacts=None):
    """Turn pre-computed spaCy NER entities into RecognizerResults."""
    results = []
    if not nlp_artifacts:
        # NER output is required; without it there is nothing to map.
        self.logger.warning(
            "Skipping SpaCy, nlp artifacts not provided...")
        return results

    ner_entities = nlp_artifacts.entities

    for entity in entities:
        # Guard clauses replace the original nested-if structure.
        if entity not in self.supported_entities:
            continue
        for ent in ner_entities:
            if not SpacyRecognizer.__check_label(entity, ent.label_):
                continue
            explanation = SpacyRecognizer.build_spacy_explanation(
                self.__class__.__name__, NER_STRENGTH, ent.label_)
            results.append(
                RecognizerResult(
                    entity, ent.start_char, ent.end_char,
                    NER_STRENGTH, explanation))

    return results
def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
    # Map pre-computed spaCy NER entities onto Presidio RecognizerResults
    # for the requested entity types.
    results = []
    if not nlp_artifacts:
        # NER never ran (or failed upstream); nothing to recognize.
        logger.warning("Skipping SpaCy, nlp artifacts not provided...")
        return results

    ner_entities = nlp_artifacts.entities

    for entity in entities:
        if entity not in self.supported_entities:
            continue
        for ent in ner_entities:
            # check_label_groups presumably maps Presidio entity names to
            # spaCy labels -- TODO confirm against its definition.
            if not self.__check_label(entity, ent.label_, self.check_label_groups):
                continue
            textual_explanation = self.DEFAULT_EXPLANATION.format(ent.label_)
            explanation = self.build_spacy_explanation(
                self.ner_strength, textual_explanation
            )
            # Score every NER hit with the recognizer's fixed ner_strength.
            spacy_result = RecognizerResult(
                entity, ent.start_char, ent.end_char, self.ner_strength, explanation
            )
            results.append(spacy_result)
    return results