def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(
        self):
    """Compare the "analyzer_results" frame when every cell holds a PII value."""
    # NOTE(review): the e-mail literals look like masked placeholders; their
    # lengths do not match the 45-60 span -- confirm against the real fixture.
    phone_sentence = "Some examples of phone numbers are +65 62345678"
    input_frame = pd.DataFrame({
        "summary": [
            "First President of Singapore NRIC was S0000001I",
            "A typical email id would look something like [email protected]",
        ],
        "phone number": [phone_sentence, phone_sentence],
    })

    summary_hits = [
        [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
        [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
    ]
    phone_hits = [
        [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
        [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
    ]
    expected_frame = pd.DataFrame({
        "summary": summary_hits,
        "phone number": phone_hits,
    })

    analysis = self.pii_detector.analyze_data_frame(input_frame)

    pd.testing.assert_frame_equal(expected_frame,
                                  analysis["analyzer_results"])
def test_analyze_data_frame_runs_analyze_only_on_cells_with_a_PII_value(
        self):
    """Compare both the report frame and the redacted frame for mixed input.

    The "remarks" column carries no PII in this fixture, so its expected
    report cells are empty lists and its expected text is unchanged.
    """
    # NOTE(review): the e-mail literals look like masked placeholders; their
    # lengths do not match the 45-60 span -- confirm against the real fixture.
    clean_remarks = ["No sensitive data", "No sensitive data"]
    input_frame = pd.DataFrame({
        "summary": [
            "First President of Singapore NRIC was S0000001I",
            "A typical email id would look something like [email protected]",
        ],
        "remarks": list(clean_remarks),
    })

    expected_report = pd.DataFrame({
        "summary": [
            [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
            [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
        ],
        "remarks": [[], []],
    })
    expected_result = pd.DataFrame({
        "summary": [
            "First President of Singapore NRIC was ",
            "A typical email id would look something like ",
        ],
        "remarks": list(clean_remarks),
    })

    outcome = self.pii_detector.analyze_data_frame(input_frame)
    actual_report, actual_result = outcome

    pd.testing.assert_frame_equal(expected_report, actual_report)
    pd.testing.assert_frame_equal(expected_result, actual_result)
def test_redact_for_multiple_analyzer_results(self):
    """DropAnonymizer.redact with two analyzer results yields the expected text."""
    detections = [
        AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
        AnalyzerResult("pii2", "PII_DETECTOR", 25, 28),
    ]
    redacted = DropAnonymizer.redact("text containing pii1 and pii2",
                                     detections)
    self.assertEqual(redacted, "text containing and ")
def test_execute_returns_all_matches_when_more_than_one(self):
    """execute on a sentence with two NRICs returns both, in any order."""
    sentence = (
        "First President of Singapore NRIC was S0000001I and the second president's was T0000001R"
    )
    found = self.test_class.execute(sentence)

    self.assertEqual(len(found), 2)
    wanted = [
        AnalyzerResult("S0000001I", "NRIC", 38, 47),
        AnalyzerResult("T0000001R", "NRIC", 79, 88),
    ]
    # assertCountEqual ignores ordering of the two results.
    self.assertCountEqual(wanted, found)
def test_calculate_detector_stats_returns_detector_counts_and_percentages(
        self):
    """Check detector stats for a column with one NRIC and two EMAIL results."""
    column_values = pd.Series([
        [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
        [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
        [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
    ])
    generator = self.report_generator_medium_level
    actual_stats = generator.calculate_detector_stats_for_each_column(
        column_values)

    expected_stats = {"NRIC": (1, "33.33%"), "EMAIL": (2, "66.67%")}
    # NOTE(review): assertCountEqual iterates its arguments, so the expected
    # dict contributes only its keys; the (count, percentage) tuples are not
    # compared here -- consider a stricter assertion once the return type is
    # confirmed.
    self.assertCountEqual(expected_stats, actual_stats)
def test_get_pii_list_returns_list_of_pii_words_given_row_of_list_of_analyzer_results(
        self):
    """_get_pii_list on a Row of analyzer-result lists returns the PII texts in order."""
    summary_hits = [
        AnalyzerResult("S0000001I", "NRIC", 38, 47),
        AnalyzerResult("S0000002I", "NRIC", 38, 47),
    ]
    phone_hits = [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]
    row_under_test = Row(summary=summary_hits, phone_number=phone_hits)

    pii_words = self.pii_detector._get_pii_list(row_under_test)

    self.assertEqual(pii_words, ["S0000001I", "S0000002I", "+65 62345678"])
def test_should_detect_and_redact_all_pii_fields_in_text(self):
    """analyze_and_redact on text with an NRIC and an e-mail equals the expected result."""
    # NOTE(review): the EMAIL span (135, 150) does not line up with the input
    # literal as written, and the e-mail literal looks like a masked
    # placeholder -- confirm whitespace and address against the real fixture.
    expected_redacted_text = """First President of Singapore NRIC was . A typical email id would look something like """
    detections = [
        AnalyzerResult("*****@*****.**", "EMAIL", 135, 150),
        AnalyzerResult("S0000001I", "NRIC", 38, 47),
    ]
    expected = AnonymizerResult(expected_redacted_text, detections)

    actual = self.pii_detector.analyze_and_redact(
        """First President of Singapore NRIC was S0000001I. A typical email id would look something like [email protected]"""
    )

    self.assertEqual(actual, expected)
def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(
        self):
    """Check the high-level report content built from a results data frame."""
    results_frame = pd.DataFrame({
        "summary": [
            [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
            [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
        ],
        "phone number": [
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
        ],
    })
    expected_frame = pd.DataFrame(
        {"Columns with PII values": ["summary", "phone number"]})

    report_content = self.report_generator_high_level.generate_report_content(
        results_frame)

    # NOTE(review): iterating a DataFrame yields its column labels, so this
    # assertion compares labels only, not the listed column names -- consider
    # a frame-level comparison once the return type is confirmed.
    self.assertCountEqual(expected_frame, report_content)
def test_should_detect_and_redact_email_in_text(self):
    """analyze_and_redact on text with an e-mail equals the expected result."""
    # NOTE(review): the e-mail literals look like masked placeholders; their
    # lengths do not match the 45-60 span -- confirm against the real fixture.
    email_hit = AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)
    expected = AnonymizerResult(
        "A typical email id would look something like ", [email_hit])

    actual = self.pii_detector.analyze_and_redact(
        "A typical email id would look something like [email protected]")

    self.assertEqual(actual, expected)
def test_should_detect_and_redact_nric_in_text(self):
    """analyze_and_redact on text with an NRIC equals the expected result."""
    nric_hit = AnalyzerResult("S0000001I", "NRIC", 38, 47)
    expected = AnonymizerResult("First President of Singapore NRIC was ",
                                [nric_hit])

    actual = self.pii_detector.analyze_and_redact(
        "First President of Singapore NRIC was S0000001I")

    self.assertEqual(actual, expected)
def test_generate_report_calls_content_generate_report_content_and_logs_it(
        self, mock_generate_content, mock_logging):
    """generate() output is compared with the mocked report content.

    Both mocks are injected by the caller (presumably patch decorators
    outside this block -- confirm).
    """
    results_frame = pd.DataFrame({
        "summary": [
            [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
            [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
        ],
        "phone number": [
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
        ],
    })
    canned_content = pd.DataFrame(
        {"Columns with PII values": ["summary", "phone number"]})
    mock_generate_content.return_value = canned_content
    mock_logging.return_value = None

    generated = self.report_generator_high_level.generate(results_frame)

    # NOTE(review): the second argument is a DataFrame, which iterates as its
    # column labels, so only labels take part in this comparison.
    self.assertCountEqual(generated, mock_generate_content.return_value)
def test_should_detect_and_redact_phone_in_text(self):
    """analyze_and_redact on text with a phone number equals the expected result."""
    phone_hit = AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)
    expected = AnonymizerResult("Some examples of phone numbers are ",
                                [phone_hit])

    actual = self.pii_detector.analyze_and_redact(
        "Some examples of phone numbers are +65 62345678")

    self.assertEqual(actual, expected)
def test_inequality(self):
    """AnalyzerResult instances differing in any one field compare unequal."""
    # Each tuple changes exactly one constructor argument relative to
    # ("text", "type", 0, 10): text, type, start, end -- in that order.
    differing_args = (
        ("different_text", "type", 0, 10),
        ("text", "different_type", 0, 10),
        ("text", "type", 1, 10),
        ("text", "type", 0, 11),
    )
    for other_args in differing_args:
        self.assertNotEqual(AnalyzerResult("text", "type", 0, 10),
                            AnalyzerResult(*other_args))
def execute(self, text):
    """Return an AnalyzerResult for every validated pattern match in ``text``.

    The pattern comes from ``self.get_pattern()``; each matched substring is
    passed to ``self.validate`` and kept only when that returns a truthy
    value. Results carry the matched text, ``self.get_name()`` and the
    match's start/end offsets, in the order the matches occur.
    """
    return [
        AnalyzerResult(hit.group(), self.get_name(), hit.start(), hit.end())
        for hit in re.finditer(self.get_pattern(), text)
        if self.validate(hit.group())
    ]
def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(
        self):
    """get_analyzer_results on a Spark frame matches the expected schema and rows."""
    # NOTE(review): the e-mail literals look like masked placeholders; their
    # lengths do not match the 45-60 span -- confirm against the real fixture.
    column_names = ["summary", "phone number"]
    input_rows = [
        ("First President of Singapore NRIC was S0000001I",
         "Some examples of phone numbers are +65 62345678"),
        ("A typical email id would look something like [email protected]",
         "Some examples of phone numbers are +65 62345678"),
    ]
    input_frame = self.SPARK.createDataFrame(input_rows, column_names)

    actual = self.pii_detector.get_analyzer_results(input_frame)

    expected_rows = [
        ([AnalyzerResult("S0000001I", "NRIC", 38, 47)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
        ([AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
    ]
    expected_frame = self.SPARK.createDataFrame(expected_rows, self.schema)

    self.assertEqual(actual.schema, expected_frame.schema)
    self.assertEqual(actual.collect(), expected_frame.collect())
def test_get_redacted_text_returns_redacted_data_frame(self):
    """get_redacted_text(input, report) matches the expected redacted Spark frame."""
    # NOTE(review): the e-mail literals look like masked placeholders; their
    # lengths do not match the 6-21 span -- confirm against the real fixture.
    column_names = ["summary", "phone number"]

    report_rows = [
        ([AnalyzerResult("S0000001I", "NRIC", 38, 47)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
        ([AnalyzerResult("*****@*****.**", "EMAIL", 6, 21),
          AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22),
          AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)]),
    ]
    report_frame = self.SPARK.createDataFrame(report_rows, self.schema)

    input_rows = [
        ("First President of Singapore NRIC was S0000001I",
         "Some examples of phone numbers are +65 62345678"),
        ("email [email protected] and phone +65 62345678",
         "Phone one +65 62345678 Phone two +65 62345678"),
    ]
    input_frame = self.SPARK.createDataFrame(input_rows, column_names)

    actual = self.pii_detector.get_redacted_text(input_frame, report_frame)

    redacted_rows = [
        ("First President of Singapore NRIC was ",
         "Some examples of phone numbers are "),
        ("email and phone ", "Phone one Phone two "),
    ]
    expected = self.SPARK.createDataFrame(redacted_rows, column_names)

    self.assertEqual(actual.schema, expected.schema)
    self.assertEqual(actual.collect(), expected.collect())
def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(
        self):
    """Check the medium-level report content built from a results data frame."""
    results_frame = pd.DataFrame({
        "summary": [
            [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
            [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
        ],
        "phone number": [
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
        ],
    })

    summary_stats = pd.Series({"NRIC": (1, "50%"), "EMAIL": (1, "50%")})
    phone_stats = pd.Series({"PHONE_NUMBER": (2, "100%")})
    expected_frame = pd.DataFrame({
        "summary": summary_stats,
        "phone number": phone_stats,
    })

    report_content = self.report_generator_medium_level.generate_report_content(
        results_frame)

    # NOTE(review): list(DataFrame) is the list of column labels, so the
    # per-detector counts and percentages above are not compared here.
    self.assertCountEqual(list(expected_frame), report_content)
def test_analyze_data_frame_runs_analyze_against_cell_with_multiple_PII_values(
        self):
    """get_analyzer_results handles cells that contain more than one PII value."""
    # NOTE(review): the e-mail literals look like masked placeholders; their
    # lengths do not match the 6-21 span -- confirm against the real fixture.
    input_rows = [
        ("First President of Singapore NRIC was S0000001I",
         "Some examples of phone numbers are +65 62345678"),
        ("email [email protected] and phone +65 62345678",
         "Phone one +65 62345678 Phone two +65 62345678"),
    ]
    input_frame = self.SPARK.createDataFrame(input_rows,
                                             ["summary", "phone number"])

    actual = self.pii_detector.get_analyzer_results(input_frame)

    expected_rows = [
        ([AnalyzerResult("S0000001I", "NRIC", 38, 47)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
        ([AnalyzerResult("*****@*****.**", "EMAIL", 6, 21),
          AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22),
          AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)]),
    ]
    expected_frame = self.SPARK.createDataFrame(expected_rows, self.schema)

    self.assertEqual(actual.schema, expected_frame.schema)
    self.assertEqual(actual.collect(), expected_frame.collect())
def __assert_single_result(self, text_to_be_tested, start, end):
    """Assert the phone-number detector yields exactly one expected result.

    The expected result uses ``text_to_be_tested`` itself as the matched
    text, the "PHONE_NUMBER" type, and the given ``start``/``end`` offsets.
    """
    detections = self.phone_number_detector.execute(text_to_be_tested)
    wanted = AnalyzerResult(text_to_be_tested, "PHONE_NUMBER", start, end)

    self.assertEqual(len(detections), 1)
    self.assertEqual(wanted, detections[0])
def test_redact_for_single_analyzer_result(self):
    """DropAnonymizer.redact with one analyzer result yields the expected text."""
    single_hit = AnalyzerResult("pii", "PII_DETECTOR", 16, 18)
    redacted = DropAnonymizer.redact("text containing pii", [single_hit])
    self.assertEqual(redacted, "text containing ")
def test_str(self):
    """str(AnalyzerResult) renders text, position and type in one sentence."""
    rendered = str(AnalyzerResult("sample_data", "type", 0, 10))
    self.assertEqual(
        rendered,
        "Text sample_data at position (0,10) was identified as type")
def test_get_detector_fetches_detector_type_correctly(self):
    """detector() returns the type string the AnalyzerResult was built with."""
    detector_name = AnalyzerResult("text", "EMAIL", 0, 10).detector()
    self.assertEqual(detector_name, "EMAIL")
def test_execute_calls_match_and_validate(self):
    """execute on a sentence with one NRIC returns exactly that result."""
    sentence = "First President of Singapore NRIC was S0000001I"
    found = self.test_class.execute(sentence)

    self.assertEqual(len(found), 1)
    wanted = AnalyzerResult("S0000001I", "NRIC", 38, 47)
    self.assertEqual(wanted, found[0])
def test_equality(self):
    """Two AnalyzerResult instances built from the same arguments compare equal."""
    same_args = ("text", "type", 0, 10)
    expected = AnalyzerResult(*same_args)
    actual = AnalyzerResult(*same_args)
    self.assertEqual(expected, actual)