Example #1
    def test_redact_with_pii_and_only_redaction(self):
        comprehend_client = MagicMock()

        comprehend_client.contains_pii_entities.return_value = [
            Document(text="Some Random text", pii_classification={'SSN': 0.53})
        ]
        comprehend_client.detect_pii_documents.return_value = [
            Document(text="Some Random text",
                     pii_classification={'SSN': 0.53},
                     pii_entities=[{
                         'Score': 0.534,
                         'Type': 'SSN',
                         'BeginOffset': 0,
                         'EndOffset': 4
                     }])
        ]

        document = redact("Some Random text",
                          Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES),
                          Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES),
                          Redactor(RedactionConfig()), comprehend_client,
                          RedactionConfig(), DEFAULT_LANGUAGE_CODE)
        comprehend_client.contains_pii_entities.assert_not_called()
        comprehend_client.detect_pii_documents.assert_called_once()
        assert document.redacted_text == "**** Random text"
 def de_segment(self, segments: List[Document]) -> Document:
     """
     Merge the segments back into one big text. It also merges back the pii classification result.
     Handles conflicting result on overlapping text between two text segments in the following ways:
     1. For pii classification, the maximum thresholds for an entity amongst the segments is
         updated as the threshold for that entity for the merged document
     2. For pii entity annotations, for a conflicting annotation span a higher priority
         is given to the one with a higher confidence threshold
     """
     merged_text = ""
     pii_classification = {}
     pii_entities = []
     segments.sort(key=lambda x: x.char_offset)
     for segment in segments:
         offset_adjusted_segment = Document(
             text=segment.text,
             char_offset=segment.char_offset,
             pii_entities=self._relocate_annotation(segment.pii_entities,
                                                    segment.char_offset),
             pii_classification=segment.pii_classification)
         self._merge_classifcation_results(segment, pii_classification)
         self._merge_pii_annotation_results(offset_adjusted_segment,
                                            pii_entities)
         merged_text = merged_text + segment.text[len(merged_text) -
                                                  segment.char_offset:]
     return Document(text=merged_text,
                     char_offset=0,
                     pii_classification=pii_classification,
                     pii_entities=pii_entities)
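A minimal usage sketch of de_segment, assuming the Document and Segmenter classes shown in these examples; the two hypothetical segments overlap on "1234", and per the docstring the merged classification keeps the per-entity maximum:

segments = [
    Document(text="John's SSN is 1234", char_offset=0,
             pii_classification={'SSN': 0.60},
             pii_entities=[{'Score': 0.60, 'Type': 'SSN',
                            'BeginOffset': 14, 'EndOffset': 18}]),
    Document(text="1234 lives in Seattle", char_offset=14,
             pii_classification={'SSN': 0.90, 'ADDRESS': 0.80},
             pii_entities=[{'Score': 0.90, 'Type': 'SSN',
                            'BeginOffset': 0, 'EndOffset': 4}]),
]
merged = Segmenter(5000).de_segment(segments)
assert merged.text == "John's SSN is 1234 lives in Seattle"
# The classification keeps the maximum score per entity type across segments.
assert merged.pii_classification == {'SSN': 0.90, 'ADDRESS': 0.80}
# The overlapping SSN annotation (offsets 14-18 after relocation) is kept once,
# resolved in favor of the higher-confidence (0.90) annotation.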
Example #3
 def test_get_interested_pii_true(self):
     assert len(
         get_interested_pii(
             Document(text="Some Random text",
                      pii_classification={'SSN': 0.534}),
             RedactionConfig())) > 0
     assert len(
         get_interested_pii(
             Document(text="Some Random text",
                      pii_classification={'SSN': 0.734}),
             RedactionConfig(pii_entity_types=['SSN'],
                             confidence_threshold=0.7))) > 0
 def segment(self, text: str, char_offset=0) -> List[Document]:
     """Segment the text into segments of max_doc_length with overlap_tokens."""
     segments = []
     starting_index = 0
     while len(text[starting_index:].encode()) > self.max_doc_size:
         trimmed_text = self._trim_to_max_bytes(text[starting_index:],
                                                self.max_doc_size)
         trimmed_text = self._trim_partial_trailing_word(trimmed_text)
         segments.append(
             Document(text=trimmed_text,
                      char_offset=char_offset + starting_index))
         starting_index = starting_index + self._find_trailing_overlapping_tokens_start_index(
             trimmed_text) + 1
     # Add the remaining segment
     if starting_index < len(text) - 1:
         segments.append(
             Document(text=text[starting_index:],
                      char_offset=char_offset + starting_index))
     return segments
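A hedged usage sketch of segment; Segmenter(20) is an illustrative max_doc_size in bytes, and the exact overlap between consecutive segments depends on _find_trailing_overlapping_tokens_start_index, which is not shown here:

segmenter = Segmenter(20)  # illustrative max size, in bytes
segments = segmenter.segment("one two three four five six seven eight")
# Every produced segment fits within max_doc_size bytes.
assert all(len(seg.text.encode()) <= 20 for seg in segments)
# char_offset records where each segment starts in the original text,
# which is what de_segment later uses to stitch the pieces back together.
assert segments[0].char_offset == 0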
def redact_pii_documents_handler(event, context):
    """Redaction Lambda function handler."""
    LOG.info('Received event with requestId: %s', event[REQUEST_ID])
    LOG.debug(f'Raw event {event}')

    InputEventValidator.validate(event)
    invoke_args = json.loads(event[S3OL_CONFIGURATION][PAYLOAD]) if event[S3OL_CONFIGURATION][PAYLOAD] else {}
    language_code = invoke_args.get(LANGUAGE_CODE, DEFAULT_LANGUAGE_CODE)
    redaction_config = RedactionConfig(**invoke_args)
    object_get_context = event[GET_OBJECT_CONTEXT]
    s3ol_access_point = event[S3OL_CONFIGURATION][S3OL_ACCESS_POINT_ARN]
    s3 = S3Client(s3ol_access_point)
    cloud_watch = CloudWatchClient()
    comprehend = ComprehendClient(s3ol_access_point=s3ol_access_point, session_id=event[REQUEST_ID], user_agent=DEFAULT_USER_AGENT,
                                  endpoint_url=COMPREHEND_ENDPOINT_URL)

    exception_handler = ExceptionHandler(s3)

    LOG.debug("Pii Entity Types to be redacted:" + str(redaction_config.pii_entity_types))
    processed_document = False
    document = Document('')

    try:
        def time_bound_task():
            nonlocal processed_document
            nonlocal document
            PartialObjectRequestValidator.validate(event)
            pii_classification_segmenter = Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES)
            pii_redaction_segmenter = Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES)
            redactor = Redactor(redaction_config)
            time1 = time.time()
            text, http_headers, status_code = s3.download_file_from_presigned_url(object_get_context[INPUT_S3_URL],
                                                                                  event[USER_REQUEST][HEADERS])
            time2 = time.time()
            LOG.info(f"Downloaded the file in : {(time2 - time1)} seconds")
            document = redact(text, pii_classification_segmenter, pii_redaction_segmenter, redactor,
                              comprehend, redaction_config, language_code)
            processed_document = True
            time1 = time.time()
            LOG.info(f"Pii redaction completed within {(time1 - time2)} seconds. Returning back the response to S3")
            redacted_text_bytes = document.redacted_text.encode('utf-8')
            http_headers[CONTENT_LENGTH] = len(redacted_text_bytes)
            s3.respond_back_with_data(redacted_text_bytes, http_headers, object_get_context[REQUEST_ROUTE],
                                      object_get_context[REQUEST_TOKEN], status_code)

        execute_task_with_timeout(context.get_remaining_time_in_millis() - RESERVED_TIME_FOR_CLEANUP, time_bound_task)
    except Exception as generated_exception:
        exception_handler.handle_exception(generated_exception, object_get_context[REQUEST_ROUTE], object_get_context[REQUEST_TOKEN])
    finally:
        if PUBLISH_CLOUD_WATCH_METRICS:
            pii_entities = get_interested_pii(document, redaction_config)
            publish_metrics(cloud_watch, s3, comprehend, processed_document, len(pii_entities) > 0, language_code,
                            s3ol_access_point, pii_entities)

    LOG.info("Responded back to s3 successfully")
 def _update_doc_with_pii_entities(self, document: Document,
                                   language) -> Document:
     start_time = time.time()
     response = None
     try:
         response = self.comprehend.detect_pii_entities(
             Text=document.text, LanguageCode=language)
     finally:
         if response is not None:
             self.detection_metrics.add_fault_count(
                 response['ResponseMetadata']['RetryAttempts'])
         self.detection_metrics.add_latency(start_time, time.time())
     # updating the document itself instead of creating a new copy to save space
     document.pii_entities = response['Entities']
     document.pii_classification = {
         entity['Type']: max(entity['Score'],
                             document.pii_classification[entity['Type']]) if
         entity['Type'] in document.pii_classification else entity['Score']
         for entity in response['Entities']
     }
     return document
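The dict comprehension above keeps, per entity type, the maximum of the pre-existing classification score and the newly detected entity score; types absent from the new entities drop out. A small worked illustration with hypothetical scores:

doc = Document(text="Some Random text",
               pii_classification={'SSN': 0.53, 'NAME': 0.40})
detected = [{'Score': 0.90, 'Type': 'SSN', 'BeginOffset': 0, 'EndOffset': 4},
            {'Score': 0.20, 'Type': 'EMAIL', 'BeginOffset': 5, 'EndOffset': 11}]
doc.pii_classification = {
    entity['Type']: max(entity['Score'], doc.pii_classification[entity['Type']])
    if entity['Type'] in doc.pii_classification else entity['Score']
    for entity in detected
}
# SSN keeps the higher of 0.53 and 0.90, EMAIL is added, NAME drops out.
assert doc.pii_classification == {'SSN': 0.90, 'EMAIL': 0.20}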
Example #7
    def test_classify_with_no_pii(self):
        comprehend_client = MagicMock()

        comprehend_client.contains_pii_entities.return_value = [
            Document(text="Some Random text", pii_classification={})
        ]
        entities = classify("Some Random text",
                            Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES),
                            comprehend_client, ClassificationConfig(),
                            DEFAULT_LANGUAGE_CODE)
        comprehend_client.contains_pii_entities.assert_called_once()
        assert len(entities) == 0
Example #8
    def test_redact_with_no_pii_and_classification(self):
        comprehend_client = MagicMock()

        comprehend_client.contains_pii_entities.return_value = [
            Document(text="Some Random text", pii_classification={})
        ]
        document = redact("Some Random text",
                          Segmenter(DOCUMENT_MAX_SIZE_CONTAINS_PII_ENTITIES),
                          Segmenter(DOCUMENT_MAX_SIZE_DETECT_PII_ENTITIES),
                          Redactor(RedactionConfig()), comprehend_client,
                          RedactionConfig(), DEFAULT_LANGUAGE_CODE)
        comprehend_client.contains_pii_entities.assert_called_once()
        comprehend_client.detect_pii_documents.assert_not_called()
        assert document.redacted_text == "Some Random text"
def redact(text, classification_segmenter: Segmenter, detection_segmenter: Segmenter,
           redactor: Redactor, comprehend: ComprehendClient, redaction_config: RedactionConfig, language_code) -> Document:
    """
    Redact pii data from given text. Logic for redacting:- .

    1. Segment text into subsegments of reasonable sizes (max doc size supported by comprehend) for doing initial classification
    2. For each subsegment ,
        2.1 call comprehend's classify-pii-document api to determine if it contains any PII data
        2.2 if it contains pii then split it to smaller chunks(e.g. <=5KB), else skip to the next subsegment
        2.3 for each chunk
             2.3.1 call comprehend's detect-pii-entities to extract the pii entities
             2.3.2 redact the pii entities from the chunk
        2.4 merge all chunks
    3. merge all subsegments
    """
    if REDACTION_API_ONLY:
        doc = Document(text)
        documents = [doc]
        docs_for_entity_detection = detection_segmenter.segment(doc.text, doc.char_offset)
    else:
        documents = comprehend.contains_pii_entities(classification_segmenter.segment(text), language_code)
        pii_docs = [doc for doc in documents if len(get_interested_pii(doc, redaction_config)) > 0]
        if not pii_docs:
            LOG.debug("Document doesn't have any pii. Nothing to redact.")
            text = classification_segmenter.de_segment(documents).text
            return Document(text, redacted_text=text)
        docs_for_entity_detection = []
        for pii_doc in pii_docs:
            docs_for_entity_detection.extend(detection_segmenter.segment(pii_doc.text, pii_doc.char_offset))

    docs_with_pii_entities = comprehend.detect_pii_documents(docs_for_entity_detection, language_code)
    resultant_doc = classification_segmenter.de_segment(documents + docs_with_pii_entities)
    assert len(resultant_doc.text) == len(text), "Not able to recover original document after segmentation and desegmentation."
    redacted_text = redactor.redact(text, resultant_doc.pii_entities)
    resultant_doc.redacted_text = redacted_text
    return resultant_doc
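The final step delegates to Redactor.redact; judging from the tests in these examples (an entity spanning offsets 0-4 of "Some Random text" yields "**** Random text"), the default configuration masks each detected span with '*' characters. A hedged sketch:

redactor = Redactor(RedactionConfig())
entities = [{'Score': 0.99, 'Type': 'SSN', 'BeginOffset': 0, 'EndOffset': 4}]
# Mirrors the expectation asserted in the tests above: "Some" -> "****".
assert redactor.redact("Some Random text", entities) == "**** Random text"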
 def _update_doc_with_pii_classification(self, document: Document,
                                         language) -> Document:
     start_time = time.time()
     response = None
     try:
         response = self.comprehend.contains_pii_entities(
             Text=document.text, LanguageCode=language)
     finally:
         if response is not None:
             self.classify_metrics.add_fault_count(
                 response['ResponseMetadata']['RetryAttempts'])
         self.classify_metrics.add_latency(start_time, time.time())
     # updating the document itself instead of creating a new copy to save space
     document.pii_classification = {
         label['Name']: label['Score']
         for label in response['Labels']
     }
     return document
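For orientation, the response consumed above is shaped like the Comprehend ContainsPiiEntities result: a list of Labels, each carrying a Name and a Score, plus the ResponseMetadata used for the fault count:

response = {
    'Labels': [{'Name': 'SSN', 'Score': 0.53},
               {'Name': 'EMAIL', 'Score': 0.77}],
    'ResponseMetadata': {'RetryAttempts': 0}
}
pii_classification = {label['Name']: label['Score'] for label in response['Labels']}
assert pii_classification == {'SSN': 0.53, 'EMAIL': 0.77}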
Example #11
    def test_redaction_handler_success_empty_payload(self, s3_client,
                                                     mocked_redact,
                                                     cloudwatch):
        with open(
                os.path.join(this_module_path, "..", 'data',
                             'sample_event.json'), 'r') as file_pointer:
            sample_event = json.load(file_pointer)
        sample_event['configuration']['payload'] = ""
        sample_text = "Some Random text"
        sample_redacted_text = "Some **** text"
        mocked_s3_client = MagicMock()
        s3_client.return_value = mocked_s3_client
        s3_get_object_response_http_headers = {'response-header': 'value2'}
        expected_response_http_headers = deepcopy(
            s3_get_object_response_http_headers)
        expected_response_http_headers[CONTENT_LENGTH] = len(
            sample_redacted_text.encode('utf-8'))

        mocked_s3_client.download_file_from_presigned_url.return_value = sample_text, s3_get_object_response_http_headers, \
                                                                         S3_STATUS_CODES.PARTIAL_CONTENT_206
        mocked_redact.return_value = Document(
            sample_text, redacted_text=sample_redacted_text)

        mocked_cloudwatch = MagicMock()
        cloudwatch.return_value = mocked_cloudwatch

        redact_pii_documents_handler(sample_event, self.mocked_context)
        mocked_redact.assert_called_once()
        mocked_s3_client.download_file_from_presigned_url.assert_called_once_with(
            sample_event[GET_OBJECT_CONTEXT][INPUT_S3_URL],
            sample_event[USER_REQUEST][HEADERS])
        mocked_s3_client.respond_back_with_data.assert_called_once_with(
            sample_redacted_text.encode('utf-8'),
            expected_response_http_headers,
            sample_event[GET_OBJECT_CONTEXT][REQUEST_ROUTE],
            sample_event[GET_OBJECT_CONTEXT][REQUEST_TOKEN],
            S3_STATUS_CODES.PARTIAL_CONTENT_206)
Example #12
 def test_desegment_overlapping_results(self):
     segments = [
         Document(
             text=
             "Some Random SSN Some Random email-id Some Random name and address and some credit card number",
             char_offset=0,
             pii_classification={
                 'SSN': 0.234,
                 'EMAIL': 0.765,
                 'NAME': 0.124,
                 'ADDRESS': 0.976
             },
             pii_entities=[{
                 'Score': 0.234,
                 'Type': 'SSN',
                 'BeginOffset': 12,
                 'EndOffset': 36
             }, {
                 'Score': 0.765,
                 'Type': 'EMAIL',
                 'BeginOffset': 28,
                 'EndOffset': 36
             }, {
                 'Score': 0.534,
                 'Type': 'NAME',
                 'BeginOffset': 49,
                 'EndOffset': 53
             }, {
                 'Score': 0.234,
                 'Type': 'ADDRESS',
                 'BeginOffset': 58,
                 'EndOffset': 65
             }]),
         Document(
             text="Some Random name and address and some credit card number",
             char_offset=37,
             pii_classification={
                 'SSN': 0.234,
                 'EMAIL': 0.765,
                 'USERNAME': 0.424,
                 'ADDRESS': 0.976
             },
             pii_entities=[{
                 'Score': 0.234,
                 'Type': 'USERNAME',
                 'BeginOffset': 12,
                 'EndOffset': 16
             }, {
                 'Score': 0.634,
                 'Type': 'ADDRESS',
                 'BeginOffset': 17,
                 'EndOffset': 28
             }, {
                 'Score': 0.234,
                 'Type': 'CREDIT_DEBIT_NUMBER',
                 'BeginOffset': 38,
                 'EndOffset': 56
             }])
     ]
     segmentor = Segmenter(5000)
     expected_merged_document = Document(
         text=
         "Some Random SSN Some Random email-id Some Random name and address and some credit card number",
         char_offset=37,
         pii_classification={
             'SSN': 0.234,
             'EMAIL': 0.765,
             'NAME': 0.124,
             'USERNAME': 0.424,
             'ADDRESS': 0.976
         },
         pii_entities=[{
             'Score': 0.234,
             'Type': 'SSN',
             'BeginOffset': 12,
             'EndOffset': 36
         }, {
             'Score': 0.765,
             'Type': 'EMAIL',
             'BeginOffset': 28,
             'EndOffset': 36
         }, {
             'Score': 0.534,
             'Type': 'NAME',
             'BeginOffset': 49,
             'EndOffset': 53
         }, {
             'Score': 0.634,
             'Type': 'ADDRESS',
             'BeginOffset': 54,
             'EndOffset': 65
         }, {
             'Score': 0.234,
             'Type': 'CREDIT_DEBIT_NUMBER',
             'BeginOffset': 75,
             'EndOffset': 93
         }])
     actual_merged_doc = segmentor.de_segment(segments)
     assert expected_merged_document.text == actual_merged_doc.text
     assert expected_merged_document.pii_classification == actual_merged_doc.pii_classification
     assert expected_merged_document.pii_entities == actual_merged_doc.pii_entities
Example #13
 def _time_consuming_call(*args, **kwargs):
     sleep(5)
     return Document(sample_text, redacted_text=sample_text)