Пример #1
0
def test_presidio_psudonymize_two_entities(text, entity1, entity2, start1,
                                           end1, start2, end2, value1, value2,
                                           fake_faker):
    """Pseudonymizing text with two entities keeps surrounding text intact."""
    presidio_response = [
        RecognizerResult(entity_type=ent, start=s, end=e, score=0.85)
        for ent, s, e in ((entity1, start1, end1), (entity2, start2, end2))
    ]
    pseudonymizer = PresidioPseudonymization(custom_faker=fake_faker,
                                             lower_case_ratio=0.0,
                                             map_to_presidio_entities=False)
    pseudonyms = pseudonymizer.pseudonymize(original_text=text,
                                            presidio_response=presidio_response,
                                            count=5)

    assert len(pseudonyms) == 5
    for candidate in pseudonyms:
        # Both fake values must appear, and the non-PII spans must survive.
        assert value1 in candidate
        assert value2 in candidate
        assert text[:start1].lower() in candidate.lower()
        assert text[end1:start2].lower() in candidate.lower()
def test_remove_duplicates():
    """Identical spans with different scores collapse to the highest score."""
    def _result(score):
        # Same span and entity type each time; only the confidence differs.
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type="x",
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        )

    results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(
        [_result(0.1), _result(0.5)])
    assert len(results) == 1
    assert results[0].score == 0.5
Пример #3
0
def test_when_remove_duplicates_different_then_entity_not_removed():
    """Identical spans with different entity types are both kept."""
    def _result(score, entity_type):
        return RecognizerResult(
            start=0,
            end=5,
            score=score,
            entity_type=entity_type,
            analysis_explanation=AnalysisExplanation(
                recognizer="test",
                original_score=0,
                pattern_name="test",
                pattern="test",
                validation_result=None,
            ),
        )

    # Same span, different entity types -> not duplicates.
    results = EntityRecognizer.remove_duplicates(
        [_result(0.1, "x"), _result(0.5, "y")])
    assert len(results) == 2
Пример #4
0
 def test_remove_duplicates_different_entity_no_removal(self):
     """Identical spans with different entity types are both kept."""
     def _result(score, entity_type):
         return RecognizerResult(
             start=0,
             end=5,
             score=score,
             entity_type=entity_type,
             analysis_explanation=AnalysisExplanation(
                 recognizer='test',
                 original_score=0,
                 pattern_name='test',
                 pattern='test',
                 validation_result=None))

     # Same span, different entity types -> both survive deduplication.
     results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(
         [_result(0.1, "x"), _result(0.5, "y")])
     assert len(results) == 2
Пример #5
0
def test_given_text_with_pii_using_package_then_analyze_and_anonymize_complete_successfully():
    """End-to-end: analyze PII in a sentence, then anonymize the findings."""
    text_to_test = "John Smith drivers license is AC432223"

    expected_response = [
        RecognizerResult("PERSON", 0, 10, 0.85),
        RecognizerResult("US_DRIVER_LICENSE", 30, 38, 0.6499999999999999),
    ]

    # Build a spaCy NLP engine from an explicit configuration and hand it
    # to the analyzer together with the supported languages.
    nlp_engine = NlpEngineProvider(nlp_configuration={
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }).create_engine()
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                              supported_languages=["en"])

    analyzer_results = analyzer.analyze(text_to_test, "en")
    for i, actual in enumerate(analyzer_results):
        assert actual == expected_response[i]

    expected_response = AnonymizerResult(
        text="<PERSON> drivers license is <US_DRIVER_LICENSE>")
    expected_response.add_item(
        AnonymizedEntity("replace", "US_DRIVER_LICENSE", 28, 47,
                         "<US_DRIVER_LICENSE>"))
    expected_response.add_item(
        AnonymizedEntity("replace", "PERSON", 0, 8, "<PERSON>"))

    anonymizer = AnonymizerEngine()
    assert anonymizer.anonymize(text_to_test, analyzer_results) == expected_response
Пример #6
0
    def __analyze_patterns(
        self, text: str, flags: int = None
    ) -> List[RecognizerResult]:
        """
        Evaluate all patterns in the provided text.

        Including words in the provided deny-list

        :param text: text to analyze
        :param flags: regex flags; defaults to re.DOTALL | re.MULTILINE
            when None is passed
        :return: A list of RecognizerResult
        """
        # Compare against None explicitly: the previous truthiness test
        # (`flags if flags else ...`) silently replaced an intentional
        # flags=0 ("no flags") with the default.
        if flags is None:
            flags = re.DOTALL | re.MULTILINE
        results = []
        for pattern in self.patterns:
            match_start_time = datetime.datetime.now()
            matches = re.finditer(pattern.regex, text, flags=flags)
            match_time = datetime.datetime.now() - match_start_time
            # Lazy %-style args keep this cheap when DEBUG is disabled.
            logger.debug(
                "--- match_time[%s]: %s.%s seconds",
                pattern.name,
                match_time.seconds,
                match_time.microseconds,
            )

            for match in matches:
                start, end = match.span()
                current_match = text[start:end]

                # Skip empty results
                if current_match == "":
                    continue

                score = pattern.score

                validation_result = self.validate_result(current_match)
                description = self.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score, validation_result
                )
                pattern_result = RecognizerResult(
                    self.supported_entities[0], start, end, score, description
                )

                # A definitive validation outcome overrides the pattern score.
                if validation_result is not None:
                    if validation_result:
                        pattern_result.score = EntityRecognizer.MAX_SCORE
                    else:
                        pattern_result.score = EntityRecognizer.MIN_SCORE

                # An invalidated match is forced to the minimum score so it
                # is dropped by the threshold check below.
                invalidation_result = self.invalidate_result(current_match)
                if invalidation_result is not None and invalidation_result:
                    pattern_result.score = EntityRecognizer.MIN_SCORE

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)

        results = EntityRecognizer.remove_duplicates(results)
        return results
Пример #7
0
def test_assert_result_within_score_range_uses_given_range():
    """Scores inside the supplied (min, max) range pass the assertion helper."""
    cases = [
        (0.3, 0.2, 0.4),
        (0.1, 0.05, 0.15),
        (0.9, 0.89, 0.91),
    ]
    for score, low, high in cases:
        result = RecognizerResult(ENTITY_TYPE, 0, 10, score)
        assert_result_within_score_range(result, ENTITY_TYPE, 0, 10, low, high)
Пример #8
0
    def __analyze_patterns(self, text):
        """
        Evaluates all patterns in the provided text, including words in
         the provided blacklist

        In a sentence we could get a false positive at the end of our regex, where we
        want to find the IBAN but not the false positive at the end of the match.

        i.e. "I want my deposit in DE89370400440532013000 2 days from today."

        :param text: text to analyze
        :param flags: regex flags
        :return: A list of RecognizerResult
        """
        results = []
        for pattern in self.patterns:
            matches = re.finditer(pattern.regex, text, flags=self.flags)

            for match in matches:
                # Walk capture groups from last to first: a higher-numbered
                # group that matched provides a tighter end position, trimming
                # trailing false-positive text off the overall match.
                for grp_num in reversed(range(1, len(match.groups()) + 1)):
                    start = match.span(0)[0]
                    # Fall back to the full match's end when the group span's
                    # end is not positive — presumably to cover groups that did
                    # not participate in the match (span (-1, -1)). NOTE(review):
                    # this also treats a group ending at index 0 as
                    # non-participating; confirm that is intended.
                    end = (
                        match.span(grp_num)[1]
                        if match.span(grp_num)[1] > 0
                        else match.span(0)[1]
                    )
                    current_match = text[start:end]

                    # Skip empty results
                    if current_match == "":
                        continue

                    score = pattern.score

                    validation_result = self.validate_result(current_match)
                    description = PatternRecognizer.build_regex_explanation(
                        self.name, pattern.name, pattern.regex, score, validation_result
                    )
                    pattern_result = RecognizerResult(
                        self.supported_entities[0], start, end, score, description
                    )

                    # A definitive validation outcome overrides the pattern score.
                    if validation_result is not None:
                        if validation_result:
                            pattern_result.score = EntityRecognizer.MAX_SCORE
                        else:
                            pattern_result.score = EntityRecognizer.MIN_SCORE

                    if pattern_result.score > EntityRecognizer.MIN_SCORE:
                        results.append(pattern_result)
                        # Accept only the first (widest accepted) result per
                        # match; remaining groups would be narrower duplicates.
                        break

        return results
Пример #9
0
def test_assert_result_within_score_range_uses_given_range_fails():
    """Scores outside the supplied (min, max) range trip the assertion helper."""
    failing_cases = [
        (0.3, 0.4, 0.6),
        (0, 0.4, 0.6),
        (1, 0, 0.5),
    ]
    for score, low, high in failing_cases:
        with pytest.raises(AssertionError):
            result = RecognizerResult(ENTITY_TYPE, 0, 10, score)
            assert_result_within_score_range(result, ENTITY_TYPE, 0, 10,
                                             low, high)
Пример #10
0
    def __analyze_patterns(self, text):
        """
        Evaluates all patterns in the provided text, including words in
         the provided blacklist

        :param text: text to analyze
        :return: A list of RecognizerResult
        """
        results = []
        # Flags are fixed for every pattern, so compute them once.
        regex_flags = re.IGNORECASE | re.DOTALL | re.MULTILINE
        for pattern in self.patterns:
            started_at = datetime.datetime.now()
            matches = re.finditer(pattern.regex, text, flags=regex_flags)
            elapsed = datetime.datetime.now() - started_at
            self.logger.debug('--- match_time[%s]: %s.%s seconds',
                              pattern.name, elapsed.seconds,
                              elapsed.microseconds)

            for match in matches:
                start, end = match.span()
                matched_text = text[start:end]

                # Zero-length matches carry no information; skip them.
                if matched_text == '':
                    continue

                score = pattern.score
                validation_result = self.validate_result(matched_text)
                description = PatternRecognizer.build_regex_explanation(
                    self.name, pattern.name, pattern.regex, score,
                    validation_result)
                pattern_result = RecognizerResult(self.supported_entities[0],
                                                  start, end, score,
                                                  description)

                # A definitive validation outcome overrides the pattern score.
                if validation_result is not None:
                    pattern_result.score = (EntityRecognizer.MAX_SCORE
                                            if validation_result
                                            else EntityRecognizer.MIN_SCORE)

                if pattern_result.score > EntityRecognizer.MIN_SCORE:
                    results.append(pattern_result)

        return results
Пример #11
0
    def convert_to_analyze_response(self, json_obj):
        """
        Convert a Text Analytics service response to RecognizerResult objects.

        :param json_obj: parsed JSON response from the service, or None when
            the recognizer failed to load
        :return: list of RecognizerResult (empty on failure, service error,
            or an empty response)
        """
        result = []

        # None means the recognizer failed to load.
        if json_obj is None:
            return result
        self.logger.info(json_obj)

        svc_response = json_obj
        if not svc_response['documents']:
            if svc_response['errors']:
                # Lazy %-args: the message is only built if actually emitted.
                self.logger.error('Text Analytics returned error: %s',
                                  str(svc_response['errors']))
            # No documents -> nothing to convert. Returning unconditionally
            # here fixes an IndexError the original hit when 'documents' was
            # empty but 'errors' was empty as well (it fell through to
            # svc_response['documents'][0]).
            return result

        for entity in svc_response['documents'][0]['entities']:
            entity_type = TextAnalyticsRecognizer.__convert_to_presidio_type(
                entity['type'],
                subtype=entity.get('subtype')
                )

            recognizer_result = \
                RecognizerResult(entity_type,
                                 entity['offset'],
                                 entity['offset'] + entity['length'],
                                 entity['score'],
                                 self.DEFAULT_EXPLANATION.format(entity['type']))
            result.append(recognizer_result)
        return result
Пример #12
0
    def _recognizer_results_from_response(
        response: requests.Response, ) -> List[RecognizerResult]:
        """Translate the service's response to a list of RecognizerResult."""
        parsed = json.loads(response.text)
        # Each JSON object maps one-to-one onto RecognizerResult kwargs.
        return [RecognizerResult(**entry) for entry in parsed]
Пример #13
0
def test_subset_perturbation():
    """Restricting nameset/gender uses only the matching fake-PII row."""
    text = "My name is Dan"
    presidio_response = [
        RecognizerResult(entity_type="PERSON", start=11, end=14, score=0.5)
    ]

    # Two rows: only the Hebrew/Female one ("Neta") matches the filters.
    fake_df = pd.DataFrame({
        "FIRST_NAME": ["Neta", "George"],
        "LAST_NAME": ["Levy", "Harrison"],
        "GENDER": ["Female", "Male"],
        "NameSet": ["Hebrew", "English"],
    })

    presidio_perturb = PresidioPerturb(
        fake_pii_df=fake_df,
        ignore_types=("DATE", "LOCATION", "ADDRESS", "GENDER"))

    perturbations = presidio_perturb.perturb(
        original_text=text,
        presidio_response=presidio_response,
        namesets=["Hebrew"],
        genders=["Female"],
        count=5,
    )
    for pert in perturbations:
        assert "neta" in pert.lower()
Пример #14
0
def create_recognizer_result(entity_type: str, score: float, start: int,
                             end: int):
    """Build a RecognizerResult from its JSON-dict representation."""
    return RecognizerResult.from_json({
        "entity_type": entity_type,
        "score": score,
        "start": start,
        "end": end,
    })
def test_given_repeat_entities_then_map_analyzer_results_returns_correct_no_of_bboxes(
    get_ocr_analyzer_results, ):
    """A repeated entity yields one bounding box per occurrence."""
    ocr_result, text, recognizer_result = get_ocr_analyzer_results

    # Introduce a second occurrence of the same PERSON entity.
    ocr_result["text"][1] = "Katie"
    recognizer_result.append(RecognizerResult("PERSON", 1, 6, 0.85))
    text = " Katie Interiors was created by Katie  Cromley."

    bboxes = ImageAnalyzerEngine.map_analyzer_results_to_bounding_boxes(
        recognizer_result, ocr_result, text)
    assert len(bboxes) == 3
Пример #16
0
def test_presidio_perturb_two_entities(
    text, entity1, entity2, start1, end1, start2, end2
):
    """Both entities are replaced while surrounding text is preserved."""
    presidio_response = [
        RecognizerResult(entity_type=ent, start=s, end=e, score=0.85)
        for ent, s, e in ((entity1, start1, end1), (entity2, start2, end2))
    ]
    presidio_perturb = PresidioPerturb(fake_pii_df=get_mock_fake_df())
    fake_df = presidio_perturb.fake_pii
    perturbations = presidio_perturb.perturb(
        original_text=text, presidio_response=presidio_response, count=5
    )

    assert len(perturbations) == 5
    for candidate in perturbations:
        lowered = candidate.lower()
        # Fake values for both entity types must appear, and the non-PII
        # spans of the original text must survive untouched.
        assert fake_df[entity1].str.lower()[0] in lowered
        assert fake_df[entity2].str.lower()[0] in lowered
        assert text[:start1].lower() in lowered
        assert text[end1:start2].lower() in lowered
Пример #17
0
def test_entity_translation():
    """An EMAIL_ADDRESS entity is swapped for the fake dataframe's value."""
    text = "My email is [email protected]"
    presidio_response = [
        RecognizerResult(entity_type="EMAIL_ADDRESS", start=12, end=27,
                         score=0.5)
    ]

    presidio_perturb = PresidioPerturb(fake_pii_df=get_mock_fake_df())
    fake_df = presidio_perturb.fake_pii
    perturbations = presidio_perturb.perturb(
        original_text=text, presidio_response=presidio_response, count=1
    )

    assert fake_df["EMAIL_ADDRESS"].str.lower()[0] in perturbations[0]
def test_given_multiword_entity_then_map_analyzer_returns_correct_bboxes_and_len(
    get_ocr_analyzer_results, ):
    """A multi-word entity maps to one bounding box per word."""
    ocr_result, text, recognizer_result = get_ocr_analyzer_results

    # Replace the fixture's results with a single multi-word PERSON entity.
    recognizer_result = [RecognizerResult("PERSON", 32, 46, 0.85)]
    expected_result = [
        ImageRecognizerResult("PERSON", 32, 46, 0.85, 896, 64, 183, 40),
        ImageRecognizerResult("PERSON", 32, 46, 0.85, 141, 134, 190, 50),
    ]

    mapped_entities = ImageAnalyzerEngine.map_analyzer_results_to_bounding_boxes(
        recognizer_result, ocr_result, text)

    assert len(mapped_entities) == len(expected_result)
    assert mapped_entities == expected_result
Пример #19
0
    def _convert_to_recognizer_result(
            self, categorized_entity: Dict) -> RecognizerResult:
        """Map a Text Analytics entity dict onto a Presidio RecognizerResult."""
        entity_type = self._get_presidio_entity_type(categorized_entity)
        offset = categorized_entity[TextAnalyticsClient.OFFSET]
        length = categorized_entity[TextAnalyticsClient.LENGTH]
        confidence = categorized_entity[TextAnalyticsClient.CONFIDENCE_SCORE]

        explanation = TextAnalyticsRecognizer._build_explanation(
            original_score=confidence,
            entity_type=entity_type,
        )
        return RecognizerResult(entity_type=entity_type,
                                start=offset,
                                end=offset + length,
                                score=confidence,
                                analysis_explanation=explanation)
Пример #20
0
    def analyze(self, text, entities, nlp_artifacts=None):
        """Build RecognizerResults from pre-computed spaCy NER artifacts."""
        results = []
        if not nlp_artifacts:
            self.logger.warning(
                "Skipping SpaCy, nlp artifacts not provided...")
            return results

        for entity in entities:
            # Guard clause: only entities this recognizer supports.
            if entity not in self.supported_entities:
                continue
            for ent in nlp_artifacts.entities:
                if not SpacyRecognizer.__check_label(entity, ent.label_):
                    continue
                explanation = SpacyRecognizer.build_spacy_explanation(
                    self.__class__.__name__, NER_STRENGTH, ent.label_)
                results.append(RecognizerResult(
                    entity, ent.start_char, ent.end_char,
                    NER_STRENGTH, explanation))

        return results
Пример #21
0
    def analyze(self, text, entities, nlp_artifacts=None):  # noqa D102
        results = []
        if not nlp_artifacts:
            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
            return results

        ner_spans = nlp_artifacts.entities

        for requested in entities:
            if requested not in self.supported_entities:
                continue
            # Keep only NER spans whose label maps to the requested entity.
            for span in ner_spans:
                if not self.__check_label(requested, span.label_,
                                          self.check_label_groups):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(
                    span.label_)
                explanation = self.build_spacy_explanation(
                    self.ner_strength, textual_explanation)
                results.append(RecognizerResult(
                    requested, span.start_char, span.end_char,
                    self.ner_strength, explanation))

        return results