def test_redaction_default_redaction_config(self):
    """Redact with a default ``RedactionConfig`` and check the masked output.

    The expected text leaves the NAME entity (score 0.234) untouched,
    presumably because it falls below the default confidence threshold,
    and replaces every character of the CREDIT_DEBIT_NUMBER entity
    (score 0.765) with an asterisk.
    """
    source_text = (
        "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account "
        "1111-0000-1111-0000 has a minimum payment of $24.53"
    )
    # The span [6, 16) covers "Zhang Wei." including the trailing period.
    name_entity = {'Score': 0.234, 'Type': 'NAME', 'BeginOffset': 6, 'EndOffset': 16}
    # The span [77, 96) is the 19-character card number.
    card_entity = {'Score': 0.765, 'Type': 'CREDIT_DEBIT_NUMBER', 'BeginOffset': 77, 'EndOffset': 96}

    default_redactor = Redactor(RedactionConfig())
    actual = default_redactor.redact(source_text, [name_entity, card_entity])

    expected = (
        "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account "
        "******************* has a minimum payment of $24.53"
    )
    assert expected == actual
def test_redact_with_pii_and_only_redaction(self):
    """Check ``redact`` when only the entity-detection call is expected.

    The mocked client reports an SSN entity over the first four characters.
    The test asserts that the classification call is skipped, detection is
    called exactly once, and those four characters are masked.
    NOTE(review): this relies on the redaction-only mode being enabled
    outside this method (e.g. by a patch) -- confirm against the class.
    """
    sample_text = "Some Random text"
    ssn_entity = {'Score': 0.534, 'Type': 'SSN', 'BeginOffset': 0, 'EndOffset': 4}

    comprehend_client = MagicMock()
    comprehend_client.contains_pii_entities.return_value = [
        Document(text=sample_text, pii_classification={'SSN': 0.53}),
    ]
    comprehend_client.detect_pii_documents.return_value = [
        Document(text=sample_text, pii_classification={'SSN': 0.53}, pii_entities=[ssn_entity]),
    ]

    classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES)
    detection_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES)
    default_redactor = Redactor(RedactionConfig())
    result = redact(
        sample_text,
        classification_segmenter,
        detection_segmenter,
        default_redactor,
        comprehend_client,
        RedactionConfig(),
        DEFAULT_LANGUAGE_CODE,
    )

    comprehend_client.contains_pii_entities.assert_not_called()
    comprehend_client.detect_pii_documents.assert_called_once()
    assert result.redacted_text == "**** Random text"
def test_redaction_with_replace_entity_type(self):
    """Redact only NAME entities, replacing them with their entity type.

    The config restricts redaction to NAME, uses the
    REPLACE_WITH_PII_ENTITY_TYPE mask mode and a 0.6 confidence threshold.
    The expected text has "Zhang Wei" replaced by "[NAME]" while the
    CREDIT_DEBIT_NUMBER entity is left as-is.
    """
    source_text = (
        "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account "
        "1111-0000-1111-0000 has a minimum payment of $24.53"
    )
    name_only_config = RedactionConfig(
        pii_entity_types=['NAME'],
        mask_mode=REPLACE_WITH_PII_ENTITY_TYPE,
        confidence_threshold=0.6,
    )
    detected_entities = [
        {'Score': 0.634, 'Type': 'NAME', 'BeginOffset': 6, 'EndOffset': 15},
        {'Score': 0.765, 'Type': 'CREDIT_DEBIT_NUMBER', 'BeginOffset': 77, 'EndOffset': 96},
    ]

    actual = Redactor(name_only_config).redact(source_text, detected_entities)

    expected = (
        "Hello [NAME]. Your AnyCompany Financial Services, LLC credit card account "
        "1111-0000-1111-0000 has a minimum payment of $24.53"
    )
    assert expected == actual
def test_redact_with_no_pii_and_classification(self):
    """Check ``redact`` when classification reports no PII.

    The mocked classification call returns a document with an empty
    ``pii_classification``. The test asserts that classification is called
    once, entity detection is never called, and the text is unchanged.
    """
    sample_text = "Some Random text"

    comprehend_client = MagicMock()
    comprehend_client.contains_pii_entities.return_value = [
        Document(text=sample_text, pii_classification={}),
    ]

    classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES)
    detection_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES)
    default_redactor = Redactor(RedactionConfig())
    result = redact(
        sample_text,
        classification_segmenter,
        detection_segmenter,
        default_redactor,
        comprehend_client,
        RedactionConfig(),
        DEFAULT_LANGUAGE_CODE,
    )

    comprehend_client.contains_pii_entities.assert_called_once()
    comprehend_client.detect_pii_documents.assert_not_called()
    assert result.redacted_text == "Some Random text"
def time_bound_task(): nonlocal processed_document nonlocal document PartialObjectRequestValidator.validate(event) pii_classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES) pii_redaction_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES) redactor = Redactor(redaction_config) time1 = time.time() text, http_headers, status_code = s3.download_file_from_presigned_url(object_get_context[INPUT_S3_URL], event[USER_REQUEST][HEADERS]) time2 = time.time() LOG.info(f"Downloaded the file in : {(time2 - time1)} seconds") document = redact(text, pii_classification_segmenter, pii_redaction_segmenter, redactor, comprehend, redaction_config, language_code) processed_document = True time1 = time.time() LOG.info(f"Pii redaction completed within {(time1 - time2)} seconds. Returning back the response to S3") redacted_text_bytes = document.redacted_text.encode('utf-8') http_headers[CONTENT_LENGTH] = len(redacted_text_bytes) s3.respond_back_with_data(redacted_text_bytes, http_headers, object_get_context[REQUEST_ROUTE], object_get_context[REQUEST_TOKEN], status_code)
def redact(text, classification_segmenter: Segmenter, detection_segmenter: Segmenter, redactor: Redactor,
           comprehend: ComprehendClient, redaction_config: RedactionConfig, language_code) -> Document:
    """
    Redact pii data from the given text and return the resulting Document.

    Steps:
    1. When REDACTION_API_ONLY is set, skip classification: wrap the whole text in a single
       Document and split it with the detection segmenter.
    2. Otherwise:
        2.1 split the text with the classification segmenter and pass the segments to
            comprehend.contains_pii_entities
        2.2 keep only the documents for which get_interested_pii returns something; when none
            remain, return a Document whose redacted_text equals the de-segmented text
        2.3 split every remaining document into chunks with the detection segmenter
    3. Pass the chunks to comprehend.detect_pii_documents.
    4. Merge the documents and the chunk results with the classification segmenter's de_segment
       and check that the merged text has the same length as the input.
    5. Redact the input text using the merged pii entities and store it on the merged document.
    """
    if REDACTION_API_ONLY:
        whole_doc = Document(text)
        documents = [whole_doc]
        detection_inputs = detection_segmenter.segment(whole_doc.text, whole_doc.char_offset)
    else:
        segments = classification_segmenter.segment(text)
        documents = comprehend.contains_pii_entities(segments, language_code)

        docs_with_pii = []
        for candidate in documents:
            if len(get_interested_pii(candidate, redaction_config)) > 0:
                docs_with_pii.append(candidate)

        if not docs_with_pii:
            LOG.debug("Document doesn't have any pii. Nothing to redact.")
            unchanged_text = classification_segmenter.de_segment(documents).text
            return Document(unchanged_text, redacted_text=unchanged_text)

        detection_inputs = [
            chunk
            for pii_doc in docs_with_pii
            for chunk in detection_segmenter.segment(pii_doc.text, pii_doc.char_offset)
        ]

    detected_docs = comprehend.detect_pii_documents(detection_inputs, language_code)
    merged_doc = classification_segmenter.de_segment(documents + detected_docs)
    # Sanity check: segmentation followed by de-segmentation must preserve the text length.
    assert len(merged_doc.text) == len(text), "Not able to recover original document after segmentation and desegmentation."
    merged_doc.redacted_text = redactor.redact(text, merged_doc.pii_entities)
    return merged_doc
def test_redaction_with_no_entities(self):
    """Redacting with an empty entity list must return the text unchanged."""
    source_text = (
        "Hello Zhang Wei. Your AnyCompany Financial Services, LLC credit card account "
        "1111-0000-1111-0000 has a minimum payment of $24.53"
    )
    default_redactor = Redactor(RedactionConfig())
    no_entities = []

    actual = default_redactor.redact(source_text, no_entities)

    assert source_text == actual