Exemplo n.º 1
0
 def test_redaction_default_redaction_config(self):
     """Redact with a default ``RedactionConfig`` and two detected entities.

     The NAME entity carries a score of 0.234 and the CREDIT_DEBIT_NUMBER
     entity a score of 0.765. The expected output keeps the name as-is and
     replaces the 19-character card number with asterisks.
     NOTE(review): presumably the name survives because its score is below
     the default confidence threshold -- confirm against RedactionConfig.
     """
     original = "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account 1111-0000-1111-0000 has a minimum payment of $24.53"
     name_entity = {
         'Score': 0.234,
         'Type': 'NAME',
         'BeginOffset': 6,
         'EndOffset': 16,
     }
     card_entity = {
         'Score': 0.765,
         'Type': 'CREDIT_DEBIT_NUMBER',
         'BeginOffset': 77,
         'EndOffset': 96,
     }

     actual = Redactor(RedactionConfig()).redact(original, [name_entity, card_entity])

     expected = "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account ******************* has a minimum payment of $24.53"
     assert expected == actual
Exemplo n.º 2
0
    def test_redact_with_pii_and_only_redaction(self):
        """Exercise ``redact`` when only the entity-detection API is used.

        The mocked client is primed for both calls, but the test expects
        ``contains_pii_entities`` never to be invoked and
        ``detect_pii_documents`` to be invoked exactly once. The single SSN
        entity spans offsets 0-4, so the first four characters are masked.
        NOTE(review): this expectation presumably relies on the
        REDACTION_API_ONLY flag being truthy; no patching is visible in this
        method -- confirm how the flag is set for this test.
        """
        source_text = "Some Random text"
        ssn_entity = {
            'Score': 0.534,
            'Type': 'SSN',
            'BeginOffset': 0,
            'EndOffset': 4,
        }

        comprehend_client = MagicMock()
        comprehend_client.contains_pii_entities.return_value = [
            Document(text=source_text, pii_classification={'SSN': 0.53}),
        ]
        comprehend_client.detect_pii_documents.return_value = [
            Document(text=source_text,
                     pii_classification={'SSN': 0.53},
                     pii_entities=[ssn_entity]),
        ]

        # Build the collaborators in the same order the call needs them.
        classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES)
        detection_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES)
        redactor = Redactor(RedactionConfig())

        document = redact(source_text, classification_segmenter, detection_segmenter,
                          redactor, comprehend_client, RedactionConfig(), DEFAULT_LANGUAGE_CODE)

        comprehend_client.contains_pii_entities.assert_not_called()
        comprehend_client.detect_pii_documents.assert_called_once()
        assert document.redacted_text == "**** Random text"
Exemplo n.º 3
0
 def test_redaction_with_replace_entity_type(self):
     """Redact with the REPLACE_WITH_PII_ENTITY_TYPE mask mode.

     The config lists only NAME as an entity type of interest and sets the
     confidence threshold to 0.6. The expected output replaces the name
     (score 0.634, offsets 6-15) with ``[NAME]`` and leaves the card number
     untouched even though an entity was reported for it.
     """
     original = "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account 1111-0000-1111-0000 has a minimum payment of $24.53"
     config = RedactionConfig(pii_entity_types=['NAME'],
                              mask_mode=REPLACE_WITH_PII_ENTITY_TYPE,
                              confidence_threshold=0.6)
     name_entity = {
         'Score': 0.634,
         'Type': 'NAME',
         'BeginOffset': 6,
         'EndOffset': 15,
     }
     card_entity = {
         'Score': 0.765,
         'Type': 'CREDIT_DEBIT_NUMBER',
         'BeginOffset': 77,
         'EndOffset': 96,
     }

     actual = Redactor(config).redact(original, [name_entity, card_entity])

     expected = "Hello [NAME]. Your AnyCompany Financial Services, LLC credit card account 1111-0000-1111-0000 has a minimum payment of $24.53"
     assert expected == actual
Exemplo n.º 4
0
    def test_redact_with_no_pii_and_classification(self):
        """Exercise ``redact`` when classification reports no PII.

        The mocked ``contains_pii_entities`` returns one document with an
        empty ``pii_classification``. The test expects that call to happen
        exactly once, ``detect_pii_documents`` never to be called, and the
        text to come back unchanged.
        """
        source_text = "Some Random text"

        comprehend_client = MagicMock()
        comprehend_client.contains_pii_entities.return_value = [
            Document(text=source_text, pii_classification={}),
        ]

        # Build the collaborators in the same order the call needs them.
        classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES)
        detection_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES)
        redactor = Redactor(RedactionConfig())

        document = redact(source_text, classification_segmenter, detection_segmenter,
                          redactor, comprehend_client, RedactionConfig(), DEFAULT_LANGUAGE_CODE)

        comprehend_client.contains_pii_entities.assert_called_once()
        comprehend_client.detect_pii_documents.assert_not_called()
        assert document.redacted_text == "Some Random text"
 def time_bound_task():
     nonlocal processed_document
     nonlocal document
     PartialObjectRequestValidator.validate(event)
     pii_classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES)
     pii_redaction_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES)
     redactor = Redactor(redaction_config)
     time1 = time.time()
     text, http_headers, status_code = s3.download_file_from_presigned_url(object_get_context[INPUT_S3_URL],
                                                                           event[USER_REQUEST][HEADERS])
     time2 = time.time()
     LOG.info(f"Downloaded the file in : {(time2 - time1)} seconds")
     document = redact(text, pii_classification_segmenter, pii_redaction_segmenter, redactor,
                       comprehend, redaction_config, language_code)
     processed_document = True
     time1 = time.time()
     LOG.info(f"Pii redaction completed within {(time1 - time2)} seconds. Returning back the response to S3")
     redacted_text_bytes = document.redacted_text.encode('utf-8')
     http_headers[CONTENT_LENGTH] = len(redacted_text_bytes)
     s3.respond_back_with_data(redacted_text_bytes, http_headers, object_get_context[REQUEST_ROUTE],
                               object_get_context[REQUEST_TOKEN], status_code)
def redact(text, classification_segmenter: Segmenter, detection_segmenter: Segmenter,
           redactor: Redactor, comprehend: ComprehendClient, redaction_config: RedactionConfig, language_code) -> Document:
    """
    Redact pii data from the given text and return the resulting Document.

    Steps performed:

    1. Decide which parts of the text go to entity detection
        1.1 when REDACTION_API_ONLY is set, skip classification: wrap the whole text in one Document and
            split it with ``detection_segmenter``
        1.2 otherwise split the text with ``classification_segmenter`` and pass the segments to
            ``comprehend.contains_pii_entities``; keep only the segments for which ``get_interested_pii``
            returns something for the given ``redaction_config``
        1.3 if no segment is kept, return a Document whose text and redacted text are both the
            de-segmented, unmodified text
        1.4 split every kept segment into chunks with ``detection_segmenter``
    2. Pass the chunks to ``comprehend.detect_pii_documents``
    3. De-segment the classification documents together with the detection results into one document
    4. Run ``redactor.redact`` once over the original text using the merged pii entities
    """
    if REDACTION_API_ONLY:
        whole_document = Document(text)
        documents = [whole_document]
        chunks = detection_segmenter.segment(whole_document.text, whole_document.char_offset)
    else:
        documents = comprehend.contains_pii_entities(classification_segmenter.segment(text), language_code)
        flagged = [
            candidate for candidate in documents
            if len(get_interested_pii(candidate, redaction_config)) > 0
        ]
        if not flagged:
            LOG.debug("Document doesn't have any pii. Nothing to redact.")
            untouched = classification_segmenter.de_segment(documents).text
            return Document(untouched, redacted_text=untouched)
        # Flatten the per-segment chunk lists, preserving segment order.
        chunks = [
            chunk
            for flagged_doc in flagged
            for chunk in detection_segmenter.segment(flagged_doc.text, flagged_doc.char_offset)
        ]

    detected = comprehend.detect_pii_documents(chunks, language_code)
    merged = classification_segmenter.de_segment(documents + detected)
    # Sanity check: the merged document must be the same length as the input text.
    assert len(merged.text) == len(text), "Not able to recover original document after segmentation and desegmentation."
    merged.redacted_text = redactor.redact(text, merged.pii_entities)
    return merged
Exemplo n.º 7
0
 def test_redaction_with_no_entities(self):
     """Redacting with an empty entity list must return the text unchanged."""
     original = "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account 1111-0000-1111-0000 has a minimum payment of $24.53"
     no_entities = []

     actual = Redactor(RedactionConfig()).redact(original, no_entities)

     assert original == actual