def test_redact_with_pii_and_only_redaction(self):
    """In redaction-only mode, classification is skipped and entities are masked directly."""
    comprehend_client = MagicMock()
    comprehend_client.contains_pii_entities.return_value = [
        Document(text="Some Random text", pii_classification={'SSN': 0.53})
    ]
    detected_doc = Document(
        text="Some Random text",
        pii_classification={'SSN': 0.53},
        pii_entities=[{
            'Score': 0.534,
            'Type': 'SSN',
            'BeginOffset': 0,
            'EndOffset': 4
        }])
    comprehend_client.detect_pii_documents.return_value = [detected_doc]

    result = redact("Some Random text",
                    Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES),
                    Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES),
                    Redactor(RedactionConfig()), comprehend_client,
                    RedactionConfig(), DEFAULT_LANGUAGE_CODE)

    # Classification must be bypassed; detection is called exactly once.
    comprehend_client.contains_pii_entities.assert_not_called()
    comprehend_client.detect_pii_documents.assert_called_once()
    assert result.redacted_text == "**** Random text"
def de_segment(self, segments: List[Document]) -> Document:
    """
    Merge the segments back into one big text, along with their pii results.

    Conflicts on overlapping text between two segments are resolved as follows:
    1. For pii classification, the maximum threshold for an entity amongst the
       segments becomes the threshold for that entity in the merged document.
    2. For pii entity annotations, a conflicting annotation span is resolved in
       favor of the one with the higher confidence threshold.
    """
    # Process segments in document order so overlaps can be detected.
    segments.sort(key=lambda seg: seg.char_offset)

    merged_text = ""
    merged_classification = {}
    merged_entities = []
    for seg in segments:
        # Shift this segment's entity offsets into whole-document coordinates.
        shifted_segment = Document(
            text=seg.text,
            char_offset=seg.char_offset,
            pii_entities=self._relocate_annotation(seg.pii_entities, seg.char_offset),
            pii_classification=seg.pii_classification)
        self._merge_classifcation_results(seg, merged_classification)
        self._merge_pii_annotation_results(shifted_segment, merged_entities)
        # Append only the suffix of this segment that extends past the text we
        # have already reassembled (overlapping prefixes are skipped).
        merged_text += seg.text[len(merged_text) - seg.char_offset:]

    return Document(text=merged_text,
                    char_offset=0,
                    pii_classification=merged_classification,
                    pii_entities=merged_entities)
def test_get_interested_pii_true(self):
    """get_interested_pii yields entities when scores clear the configured threshold."""
    default_config_doc = Document(text="Some Random text", pii_classification={'SSN': 0.534})
    assert len(get_interested_pii(default_config_doc, RedactionConfig())) > 0

    high_score_doc = Document(text="Some Random text", pii_classification={'SSN': 0.734})
    custom_config = RedactionConfig(pii_entity_types=['SSN'], confidence_threshold=0.7)
    assert len(get_interested_pii(high_score_doc, custom_config)) > 0
def segment(self, text: str, char_offset=0) -> List[Document]:
    """Segment the text into segments of max_doc_length with overlap_tokens.

    :param text: text to split into segments
    :param char_offset: offset of `text` within the larger original document;
        propagated into each produced segment's char_offset
    :return: list of Documents whose texts cover all of `text`; consecutive
        segments overlap by the trailing-token region so entity spans that
        straddle a boundary are still detected
    """
    segments = []
    starting_index = 0
    # Carve off fixed-size segments while the remainder is still too large
    # (size limit is in encoded bytes, not characters).
    while len(text[starting_index:].encode()) > self.max_doc_size:
        trimmed_text = self._trim_to_max_bytes(text[starting_index:], self.max_doc_size)
        # Avoid cutting a word in half at the segment boundary.
        trimmed_text = self._trim_partial_trailing_word(trimmed_text)
        segments.append(
            Document(text=trimmed_text, char_offset=char_offset + starting_index))
        # Next segment starts at the beginning of the trailing overlap region.
        starting_index = starting_index + self._find_trailing_overlapping_tokens_start_index(
            trimmed_text) + 1
    # Add the remaining segment.
    # BUGFIX: the condition was `starting_index < len(text) - 1`, which silently
    # dropped a remainder of exactly one character (starting_index == len(text)-1),
    # breaking the "de_segment(segment(text)) == text" round-trip invariant.
    if starting_index < len(text):
        segments.append(
            Document(text=text[starting_index:], char_offset=char_offset + starting_index))
    return segments
def redact_pii_documents_handler(event, context):
    """Redaction Lambda function handler.

    Validates the S3 Object Lambda event, downloads the requested object,
    redacts PII from its text within the remaining Lambda time budget, and
    responds back through the S3 Object Lambda write-back API. Cloud Watch
    metrics are published in the `finally` block when enabled.
    """
    LOG.info('Received event with requestId: %s', event[REQUEST_ID])
    LOG.debug(f'Raw event {event}')
    InputEventValidator.validate(event)
    # The access-point configuration payload (if present) carries the redaction
    # config and language code as JSON.
    invoke_args = json.loads(event[S3OL_CONFIGURATION][PAYLOAD]) if event[S3OL_CONFIGURATION][PAYLOAD] else {}
    language_code = invoke_args.get(LANGUAGE_CODE, DEFAULT_LANGUAGE_CODE)
    redaction_config = RedactionConfig(**invoke_args)
    object_get_context = event[GET_OBJECT_CONTEXT]
    s3ol_access_point = event[S3OL_CONFIGURATION][S3OL_ACCESS_POINT_ARN]
    s3 = S3Client(s3ol_access_point)
    cloud_watch = CloudWatchClient()
    comprehend = ComprehendClient(s3ol_access_point=s3ol_access_point, session_id=event[REQUEST_ID],
                                  user_agent=DEFAULT_USER_AGENT, endpoint_url=COMPREHEND_ENDPOINT_URL)
    exception_handler = ExceptionHandler(s3)
    LOG.debug("Pii Entity Types to be redacted:" + str(redaction_config.pii_entity_types))
    # Track progress so the finally-block metrics reflect how far we got.
    processed_document = False
    document = Document('')
    try:
        def time_bound_task():
            # Runs under execute_task_with_timeout so that a slow download or
            # redaction cannot exhaust the Lambda's remaining execution time.
            nonlocal processed_document
            nonlocal document
            PartialObjectRequestValidator.validate(event)
            pii_classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES)
            pii_redaction_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES)
            redactor = Redactor(redaction_config)
            time1 = time.time()
            text, http_headers, status_code = s3.download_file_from_presigned_url(
                object_get_context[INPUT_S3_URL], event[USER_REQUEST][HEADERS])
            time2 = time.time()
            LOG.info(f"Downloaded the file in : {(time2 - time1)} seconds")
            document = redact(text, pii_classification_segmenter, pii_redaction_segmenter,
                              redactor, comprehend, redaction_config, language_code)
            processed_document = True
            time1 = time.time()
            LOG.info(f"Pii redaction completed within {(time1 - time2)} seconds. "
                     f"Returning back the response to S3")
            redacted_text_bytes = document.redacted_text.encode('utf-8')
            # Content-Length must describe the redacted payload, not the original object.
            http_headers[CONTENT_LENGTH] = len(redacted_text_bytes)
            s3.respond_back_with_data(redacted_text_bytes, http_headers,
                                      object_get_context[REQUEST_ROUTE],
                                      object_get_context[REQUEST_TOKEN], status_code)

        execute_task_with_timeout(context.get_remaining_time_in_millis() - RESERVED_TIME_FOR_CLEANUP,
                                  time_bound_task)
    except Exception as generated_exception:
        # Translate any failure into an error response on the S3 request route.
        exception_handler.handle_exception(generated_exception,
                                           object_get_context[REQUEST_ROUTE],
                                           object_get_context[REQUEST_TOKEN])
    finally:
        if PUBLISH_CLOUD_WATCH_METRICS:
            # `document` is the empty placeholder if redaction never ran.
            pii_entities = get_interested_pii(document, redaction_config)
            publish_metrics(cloud_watch, s3, comprehend, processed_document,
                            len(pii_entities) > 0, language_code, s3ol_access_point,
                            pii_entities)
    LOG.info("Responded back to s3 successfully")
def _update_doc_with_pii_entities(self, document: Document, language) -> Document: start_time = time.time() response = None try: response = self.comprehend.detect_pii_entities( Text=document.text, LanguageCode=language) finally: if response is not None: self.detection_metrics.add_fault_count( response['ResponseMetadata']['RetryAttempts']) self.detection_metrics.add_latency(start_time, time.time()) # updating the document itself instead of creating a new copy to save space document.pii_entities = response['Entities'] document.pii_classification = { entity['Type']: max(entity['Score'], document.pii_classification[entity['Type']]) if entity['Type'] in document.pii_classification else entity['Score'] for entity in response['Entities'] } return document
def test_classify_with_no_pii(self):
    """classify returns no entities when classification finds nothing of interest."""
    comprehend_client = MagicMock()
    comprehend_client.contains_pii_entities.return_value = [
        Document(text="Some Random text", pii_classification={})
    ]

    detected_entities = classify("Some Random text",
                                 Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES),
                                 comprehend_client, ClassificationConfig(),
                                 DEFAULT_LANGUAGE_CODE)

    comprehend_client.contains_pii_entities.assert_called_once()
    assert len(detected_entities) == 0
def test_redact_with_no_pii_and_classification(self):
    """When classification finds no PII, detection is skipped and text is unchanged."""
    comprehend_client = MagicMock()
    comprehend_client.contains_pii_entities.return_value = [
        Document(text="Some Random text", pii_classification={})
    ]

    result = redact("Some Random text",
                    Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES),
                    Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES),
                    Redactor(RedactionConfig()), comprehend_client,
                    RedactionConfig(), DEFAULT_LANGUAGE_CODE)

    comprehend_client.contains_pii_entities.assert_called_once()
    comprehend_client.detect_pii_documents.assert_not_called()
    assert result.redacted_text == "Some Random text"
def redact(text, classification_segmenter: Segmenter, detection_segmenter: Segmenter,
           redactor: Redactor, comprehend: ComprehendClient, redaction_config: RedactionConfig,
           language_code) -> Document:
    """
    Redact pii data from given text. Logic for redacting:- .
    1. Segment text into subsegments of reasonable sizes (max doc size supported by comprehend)
       for doing initial classification
    2. For each subsegment ,
       2.1 call comprehend's classify-pii-document api to determine if it contains any PII data
       2.2 if it contains pii then split it to smaller chunks(e.g. <=5KB), else skip to the next subsegment
       2.3 for each chunk
           2.3.1 call comprehend's detect-pii-entities to extract the pii entities
           2.3.2 redact the pii entities from the chunk
       2.4 merge all chunks
    3. merge all subsegments
    """
    if REDACTION_API_ONLY:
        # Module-level flag: skip the classification pass entirely and send the
        # whole text straight to entity detection.
        doc = Document(text)
        documents = [doc]
        docs_for_entity_detection = detection_segmenter.segment(doc.text, doc.char_offset)
    else:
        documents = comprehend.contains_pii_entities(classification_segmenter.segment(text),
                                                     language_code)
        # Keep only the segments whose classification results clear the configured
        # entity types / confidence threshold.
        pii_docs = [doc for doc in documents if len(get_interested_pii(doc, redaction_config)) > 0]
        if not pii_docs:
            LOG.debug("Document doesn't have any pii. Nothing to redact.")
            text = classification_segmenter.de_segment(documents).text
            return Document(text, redacted_text=text)
        # Re-segment the PII-bearing subsegments into detection-sized chunks,
        # preserving their offsets within the original text.
        docs_for_entity_detection = []
        for pii_doc in pii_docs:
            docs_for_entity_detection.extend(
                detection_segmenter.segment(pii_doc.text, pii_doc.char_offset))
    docs_with_pii_entities = comprehend.detect_pii_documents(docs_for_entity_detection,
                                                             language_code)
    # Reassemble the full text plus merged entity annotations from all segments.
    resultant_doc = classification_segmenter.de_segment(documents + docs_with_pii_entities)
    # Sanity check that segmentation/desegmentation round-tripped the text.
    assert len(resultant_doc.text) == len(text), "Not able to recover original document after segmentation and desegmentation."
    redacted_text = redactor.redact(text, resultant_doc.pii_entities)
    resultant_doc.redacted_text = redacted_text
    return resultant_doc
def _update_doc_with_pii_classification(self, document: Document, language) -> Document: start_time = time.time() response = None try: response = self.comprehend.contains_pii_entities( Text=document.text, LanguageCode=language) finally: if response is not None: self.classify_metrics.add_fault_count( response['ResponseMetadata']['RetryAttempts']) self.classify_metrics.add_latency(start_time, time.time()) # updating the document itself instead of creating a new copy to save space document.pii_classification = { label['Name']: label['Score'] for label in response['Labels'] } return document
def test_redaction_handler_success_empty_payload(self, s3_client, mocked_redact, cloudwatch):
    """Handler succeeds with an empty payload, falling back to default configuration."""
    event_path = os.path.join(this_module_path, "..", 'data', 'sample_event.json')
    with open(event_path, 'r') as event_file:
        event = json.load(event_file)
    event['configuration']['payload'] = ""

    original_text = "Some Random text"
    redacted_text = "Some **** text"

    s3_mock = MagicMock()
    s3_client.return_value = s3_mock
    download_headers = {'response-header': 'value2'}
    # The handler must add Content-Length for the redacted (not original) bytes.
    expected_headers = deepcopy(download_headers)
    expected_headers[CONTENT_LENGTH] = len(redacted_text.encode('utf-8'))
    s3_mock.download_file_from_presigned_url.return_value = (
        original_text, download_headers, S3_STATUS_CODES.PARTIAL_CONTENT_206)
    mocked_redact.return_value = Document(original_text, redacted_text=redacted_text)
    cloudwatch.return_value = MagicMock()

    redact_pii_documents_handler(event, self.mocked_context)

    mocked_redact.assert_called_once()
    s3_mock.download_file_from_presigned_url.assert_called_once_with(
        event[GET_OBJECT_CONTEXT][INPUT_S3_URL], event[USER_REQUEST][HEADERS])
    s3_mock.respond_back_with_data.assert_called_once_with(
        redacted_text.encode('utf-8'), expected_headers,
        event[GET_OBJECT_CONTEXT][REQUEST_ROUTE],
        event[GET_OBJECT_CONTEXT][REQUEST_TOKEN],
        S3_STATUS_CODES.PARTIAL_CONTENT_206)
def test_desegment_overlapping_results(self):
    # Two segments overlap: segment 2 starts at char_offset 37, inside segment 1.
    # de_segment must merge text, take max classification score per entity type,
    # and resolve conflicting annotation spans in favor of the higher score.
    segments = [
        Document(
            text=
            "Some Random SSN Some Random email-id Some Random name and address and some credit card number",
            char_offset=0,
            pii_classification={
                'SSN': 0.234,
                'EMAIL': 0.765,
                'NAME': 0.124,
                'ADDRESS': 0.976
            },
            pii_entities=[{
                'Score': 0.234,
                'Type': 'SSN',
                'BeginOffset': 12,
                'EndOffset': 36
            }, {
                'Score': 0.765,
                'Type': 'EMAIL',
                'BeginOffset': 28,
                'EndOffset': 36
            }, {
                'Score': 0.534,
                'Type': 'NAME',
                'BeginOffset': 49,
                'EndOffset': 53
            }, {
                'Score': 0.234,
                'Type': 'ADDRESS',
                'BeginOffset': 58,
                'EndOffset': 65
            }]),
        Document(
            # Overlapping tail of the first segment; entity offsets are relative
            # to this segment and must be relocated by char_offset during merge.
            text="Some Random name and address and some credit card number",
            char_offset=37,
            pii_classification={
                'SSN': 0.234,
                'EMAIL': 0.765,
                'USERNAME': 0.424,
                'ADDRESS': 0.976
            },
            pii_entities=[{
                'Score': 0.234,
                'Type': 'USERNAME',
                'BeginOffset': 12,
                'EndOffset': 16
            }, {
                'Score': 0.634,
                'Type': 'ADDRESS',
                'BeginOffset': 17,
                'EndOffset': 28
            }, {
                'Score': 0.234,
                'Type': 'CREDIT_DEBIT_NUMBER',
                'BeginOffset': 38,
                'EndOffset': 56
            }])
    ]
    segmentor = Segmenter(5000)
    # Expected: higher-scored ADDRESS (0.634) wins over overlapping USERNAME span;
    # classification keeps the max score per type across both segments.
    expected_merged_document = Document(
        text=
        "Some Random SSN Some Random email-id Some Random name and address and some credit card number",
        char_offset=37,
        pii_classification={
            'SSN': 0.234,
            'EMAIL': 0.765,
            'NAME': 0.124,
            'USERNAME': 0.424,
            'ADDRESS': 0.976
        },
        pii_entities=[{
            'Score': 0.234,
            'Type': 'SSN',
            'BeginOffset': 12,
            'EndOffset': 36
        }, {
            'Score': 0.765,
            'Type': 'EMAIL',
            'BeginOffset': 28,
            'EndOffset': 36
        }, {
            'Score': 0.534,
            'Type': 'NAME',
            'BeginOffset': 49,
            'EndOffset': 53
        }, {
            'Score': 0.634,
            'Type': 'ADDRESS',
            'BeginOffset': 54,
            'EndOffset': 65
        }, {
            'Score': 0.234,
            'Type': 'CREDIT_DEBIT_NUMBER',
            'BeginOffset': 75,
            'EndOffset': 93
        }])
    actual_merged_doc = segmentor.de_segment(segments)
    assert expected_merged_document.text == actual_merged_doc.text
    assert expected_merged_document.pii_classification == actual_merged_doc.pii_classification
    assert expected_merged_document.pii_entities == actual_merged_doc.pii_entities
def _time_consuming_call(*args, **kwargs):
    # Test stub: stalls long enough to trip the handler's timeout machinery,
    # then returns an un-redacted document.
    # NOTE(review): `sample_text` is a free variable from the enclosing scope,
    # not visible in this chunk — confirm it is defined in the surrounding test.
    sleep(5)
    return Document(sample_text, redacted_text=sample_text)