def testE2eBigquery(self, mock_bq_source_fn, mock_bq_sink_fn, mock_utcnow_fn):
  """End-to-end eval-pipeline test reading findings/goldens from BigQuery.

  Feeds a single record (findings XML vs. golden XML) through
  run_pipeline_lib.run_pipeline and checks (a) the JOIN query generated for
  the BigQuery source and (b) the per-info-type stats written to the results
  table.

  NOTE(review): the three mock arguments are presumably injected by
  mock.patch decorators applied to this method — decorators are not visible
  in this chunk; confirm against the full file.
  """
  def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
    return beam_testutil.FakeSink(table_name)
  mock_bq_sink_fn.side_effect = make_sink
  # Freeze the clock so the 'timestamp' column written by the pipeline is
  # predictable in the assertions below.
  now = 'current time'
  mock_utcnow_fn.return_value = now

  # One true positive, one false positive, two false negatives for 'TypeA'.
  tp_tag = tag_template.format('TypeA', 0, 5)
  fp_tag = tag_template.format('TypeA', 8, 10)
  fn_tag = tag_template.format('TypeA', 11, 13)
  fn2_tag = tag_template.format('TypeA', 15, 19)
  findings_tags = '\n'.join([tp_tag, fp_tag])
  golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])
  # Fake BigQuery source yields a single joined findings/golden row.
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'findings_record_id': '111-1',
      'findings_xml': xml_template.format(findings_tags),
      'golden_xml': xml_template.format(golden_tags)
  }]
  types_to_ignore = ['ignore']

  # These features are tested in testE2eGCS.
  input_pattern, golden_dir, results_dir, per_note_table, debug_table = (
      None, None, None, None, None)
  mae_input_query = 'SELECT * from [project.dataset.table]'
  mae_golden_table = 'project.dataset.golden_table'

  run_pipeline_lib.run_pipeline(input_pattern, golden_dir, results_dir,
                                mae_input_query, mae_golden_table, False,
                                'results_table', per_note_table, debug_table,
                                types_to_ignore, pipeline_args=None)

  # Check that we generated the query correctly.
  mock_bq_source_fn.assert_called_with(
      query=('SELECT findings.record_id, findings.xml, golden.xml FROM '
             '(SELECT * from [project.dataset.table]) AS findings '
             'LEFT JOIN [project.dataset.golden_table] AS golden '
             'ON findings.record_id=golden.record_id'))

  # Check we wrote the correct results to BigQuery.
  # Only one info type is present, so 'ALL' matches 'TypeA' exactly.
  expected_results = [{
      'info_type': 'ALL',
      'recall': 0.333333,
      'precision': 0.5,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'info_type': u'TypeA',
      'recall': 0.333333,
      'precision': 0.5,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }]
  for r in expected_results:
    r.update({'timestamp': now})
  # Sort for a deterministic comparison; float fields are normalized to
  # avoid precision mismatches.
  actual_results = sorted(beam_testutil.get_table('results_table'),
                          key=lambda x: x['info_type'])
  self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                   [normalize_dict_floats(r) for r in actual_results])
def testE2eGCS(self, fake_client_fn, mock_bq_sink_fn, mock_utcnow_fn):
  """End-to-end eval-pipeline test reading findings/goldens from GCS.

  Exercises entity vs. binary-token matching on two fake notes, then checks
  the stats written to BigQuery (aggregate, per-note, debug info) and the
  result protos written to GCS.

  NOTE(review): the three mock arguments are presumably injected by
  mock.patch decorators applied to this method — decorators are not visible
  in this chunk; confirm against the full file.
  """
  def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
    return beam_testutil.FakeSink(table_name)
  mock_bq_sink_fn.side_effect = make_sink
  # Freeze the clock so the 'timestamp' column is predictable.
  now = 'current time'
  mock_utcnow_fn.return_value = now
  input_pattern = 'gs://bucketname/input/*'
  golden_dir = 'gs://bucketname/goldens'
  results_dir = 'gs://bucketname/results'
  storage_client = testutil.FakeStorageClient()
  fake_client_fn.return_value = storage_client

  # Note 1-1: one true positive, one false positive, two false negatives.
  tp_tag = tag_template.format('TypeA', 0, 5)
  fp_tag = tag_template.format('TypeA', 8, 10)
  fn_tag = tag_template.format('TypeA', 11, 13)
  fn2_tag = tag_template.format('TypeA', 15, 19)
  findings_tags = '\n'.join([tp_tag, fp_tag])
  golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])
  testutil.set_gcs_file('bucketname/input/1-1.xml',
                        xml_template.format(findings_tags))
  testutil.set_gcs_file('bucketname/goldens/1-1.xml',
                        xml_template.format(golden_tags))

  tp2_tag = tag_template.format('TypeB', 20, 21)
  # False negative + false positive for entity matching, but true positive for
  # binary token matching.
  entity_fp_tag = tag_template.format('TypeX', 30, 35)
  entity_fn_tag = tag_template.format('TypeY', 30, 35)
  # Two tokens are tagged as one in the golden. This is not a match for entity
  # matching, but is two matches for binary token matching.
  partial_tag1 = tag_template.format('TypeA', 36, 41)
  partial_tag2 = tag_template.format('TypeA', 42, 47)
  partial_tag3 = tag_template.format('TypeA', 48, 54)
  multi_token_tag = tag_template.format('TypeA', 36, 54)
  ignored_tag = tag_template.format('ignore', 55, 57)
  findings_tags = '\n'.join([
      tp_tag, tp2_tag, entity_fp_tag, partial_tag1, partial_tag2, partial_tag3,
      ignored_tag
  ])
  golden_tags = '\n'.join([tp_tag, tp2_tag, entity_fn_tag, multi_token_tag])
  testutil.set_gcs_file('bucketname/input/1-2.xml',
                        xml_template.format(findings_tags))
  testutil.set_gcs_file('bucketname/goldens/1-2.xml',
                        xml_template.format(golden_tags))

  self.old_write_to_text = beam.io.WriteToText
  beam.io.WriteToText = beam_testutil.DummyWriteTransform
  types_to_ignore = ['ignore']
  mae_input_query = None
  mae_golden_table = None
  try:
    run_pipeline_lib.run_pipeline(input_pattern, golden_dir, results_dir,
                                  mae_input_query, mae_golden_table, True,
                                  'results_table', 'per_note_results_table',
                                  'debug_output_table', types_to_ignore,
                                  pipeline_args=None)
  finally:
    # Restore the monkeypatched transform even if the pipeline raises, so a
    # failure here cannot leak the fake WriteToText into other tests.
    beam.io.WriteToText = self.old_write_to_text

  # Check we wrote the correct results to BigQuery.
  expected_results = [{
      'info_type': 'ALL',
      'recall': 0.7777777777777778,
      'precision': 0.875,
      'f_score': 0.823529411764706,
      'true_positives': 7,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'info_type': u'TypeA',
      'recall': 0.7142857142857143,
      'precision': 0.8333333333333334,
      'f_score': 0.7692307692307694,
      'true_positives': 5,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'info_type': u'TypeB',
      'recall': 1.0,
      'precision': 1.0,
      'f_score': 1.0,
      'true_positives': 1,
      'false_positives': 0,
      'false_negatives': 0
  }, {
      'info_type': u'TypeY',
      'recall': 1.0,
      'precision': 1.0,
      'f_score': 1.0,
      'true_positives': 1,
      'false_positives': 0,
      'false_negatives': 0
  }]
  for r in expected_results:
    r.update({'timestamp': now})
  actual_results = sorted(beam_testutil.get_table('results_table'),
                          key=lambda x: x['info_type'])
  self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                   [normalize_dict_floats(r) for r in actual_results])

  # The note text both fake notes are drawn from; used to rebuild the
  # 'context' strings the pipeline writes to the debug table.
  full_text = 'word1 w2 w3 wrd4 5 word6 word7 multi token entity w8'

  def debug_info(record_id, classification, text, info_type, start, end):
    # Build the expected debug row: the context highlights the finding with
    # '{[--' ... '--]}' markers inside the full note text.
    location = full_text.find(text)
    context = (full_text[0:location] + '{[--' + text + '--]}' +
               full_text[location + len(text):])
    return {
        'record_id': record_id,
        'classification': classification,
        'text': text,
        'info_type': info_type,
        'context': context,
        'start': start,
        'end': end
    }

  expected_debug_info = [
      debug_info('1-1', 'true_positive', 'word1', 'TypeA', 0, 5),
      debug_info('1-1', 'false_positive', 'w2', 'TypeA', 8, 10),
      debug_info('1-1', 'false_negative', 'w3', 'TypeA', 11, 13),
      debug_info('1-1', 'false_negative', 'wrd4', 'TypeA', 15, 19),
      debug_info('1-2', 'true_positive', 'word1', 'TypeA', 0, 5),
      debug_info('1-2', 'true_positive', '5', 'TypeB', 20, 21),
      debug_info('1-2', 'true_positive', 'word7', 'TypeY', 30, 35),
      debug_info('1-2', 'true_positive', 'multi', 'TypeA', 36, 41),
      debug_info('1-2', 'true_positive', 'token', 'TypeA', 42, 47),
      debug_info('1-2', 'true_positive', 'entity', 'TypeA', 48, 54),
  ]
  for r in expected_debug_info:
    r.update({'timestamp': now})

  def s(l):
    # Sort on a key that is unique per row so the comparison is stable.
    return sorted(l, key=lambda x: x['record_id'] + x['context'])
  self.assertEqual(s(expected_debug_info),
                   s(beam_testutil.get_table('debug_output_table')))

  expected_per_note = [{
      'record_id': '1-1',
      'precision': 0.5,
      'recall': 0.3333333333333333,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'record_id': '1-2',
      'precision': 1.0,
      'recall': 1.0,
      'f_score': 1.0,
      'true_positives': 6,
      'false_positives': 0,
      'false_negatives': 0
  }]
  for r in expected_per_note:
    r.update({'timestamp': now})
  actual_results = sorted(
      beam_testutil.get_table('per_note_results_table'),
      key=lambda x: x['record_id'])
  self.assertEqual([normalize_dict_floats(r) for r in expected_per_note],
                   [normalize_dict_floats(r) for r in actual_results])

  # Check we wrote the correct results to GCS.
  expected_text = ''
  with open(os.path.join(TESTDATA_DIR, 'expected_results')) as f:
    expected_text = f.read()
  expected_results = results_pb2.Results()
  text_format.Merge(expected_text, expected_results)
  results = results_pb2.Results()
  text_format.Merge(
      testutil.get_gcs_file('bucketname/results/aggregate_results.txt'),
      results)
  self.assertEqual(normalize_floats(expected_results),
                   normalize_floats(results))

  # Check the per-file results were written correctly.
  expected_result1 = results_pb2.IndividualResult()
  text_format.Merge(
      """
      record_id: "1-1"
      stats {
        true_positives: 1
        false_positives: 1
        false_negatives: 2
        precision: 0.5
        recall: 0.333333333333
        f_score: 0.4
      }""", expected_result1)
  expected_result2 = results_pb2.IndividualResult()
  text_format.Merge(
      """
      record_id: "1-2"
      stats {
        true_positives: 6
        precision: 1.0
        recall: 1.0
        f_score: 1.0
      }""", expected_result2)
  normalize_floats(expected_result1)
  normalize_floats(expected_result2)

  full_text = testutil.get_gcs_file('bucketname/results/per-note-results')
  actual_results = []
  # Records are separated by blank lines; sort for a stable comparison and
  # skip the empty trailing chunk.
  for record in sorted(full_text.split('\n\n')):
    if not record:
      continue
    actual_result = results_pb2.IndividualResult()
    text_format.Merge(record, actual_result)
    actual_results.append(normalize_floats(actual_result))
  self.assertEqual([expected_result1, expected_result2], actual_results)
def testReBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """Tests that oversized DLP requests are retried with smaller batches.

  The fake DLP API reports truncated inspect findings and raises a
  "Too many findings to de-identify" HTTP 400 on the first (batched) deid
  call; the pipeline is then expected to retry per-row and still produce the
  same final output as a successful batch run.

  NOTE(review): the three mock arguments are presumably injected by
  mock.patch decorators applied to this method — decorators are not visible
  in this chunk; confirm against the full file.
  """
  mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)
  # Fake BigQuery source with two patient records.
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }, {
      'first_name': 'Zephod',
      'last_name': 'Beeblebrox',
      'note': 'note2 text',
      'patient_id': '222',
      'record_number': '2'
  }]

  # Per-row deid responses used after the batched call fails.
  deid_response1 = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Boaty'),
                      sval('McBoatface'),
                      sval('note1 redacted'),
                      sval('111'),
                      sval('1')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }
  deid_response2 = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Zephod'),
                      sval('Beeblebrox'),
                      sval('note2 redacted'),
                      sval('222'),
                      sval('2')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }

  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings1 = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }]
  }
  findings2 = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  # First inspect response is truncated, which should also force per-row
  # re-inspection (responses 2 and 3).
  inspect_response_truncated = {'result': {'findingsTruncated': 'True'}}
  inspect_responses = [
      inspect_response_truncated, {
          'result': findings1
      }, {
          'result': findings2
      }
  ]

  def inspect_execute():
    # Return the next canned inspect response on each call.
    response = inspect_responses[inspect_execute.call_count]
    inspect_execute.call_count += 1
    return response
  inspect_execute.call_count = 0
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(execute=inspect_execute)

  # First deid call (the batched one) raises; the two retries succeed.
  deid_responses = ['Exception', deid_response1, deid_response2]

  def deid_execute():
    response = deid_responses[deid_execute.call_count]
    deid_execute.call_count += 1
    if response == 'Exception':
      content = ('{"error": {"message": "Too many findings to de-identify. '
                 'Retry with a smaller request."}}')
      raise errors.HttpError(httplib2.Response({'status': 400}), content)
    return response
  deid_execute.call_count = 0
  fake_content.deidentify.return_value = Mock(execute=deid_execute)
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  # Fake BigQuery client returning the same two rows as the beam source.
  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'],
          ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR, 'testdata/batch_config.json')
  run_deid_lib.run_pipeline('input_query', None, 'deid_tbl', 'findings_tbl',
                            'gs://mae-bucket/mae-dir', 'mae_tbl',
                            deid_cfg_file, 'InspectPhiTask',
                            'fake-credentials', 'project',
                            testutil.FakeStorageClient, bq_client, None,
                            'dlp', batch_size=2, dtd_dir=None,
                            pipeline_args=None)

  # The first (batched) request must match the golden request body.
  expected_request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/batch_request.json')) as f:
    expected_request_body = json.load(f)
  fake_content.deidentify.assert_called()
  _, kwargs = fake_content.deidentify.call_args_list[0]
  self.maxDiff = 10000
  self.assertEqual(ordered(expected_request_body), ordered(kwargs['body']))

  # Final outputs should be identical to a successful batch run.
  self.assertEqual(beam_testutil.get_table('deid_tbl'), EXPECTED_DEID_RESULT)
  self.assertEqual(EXPECTED_MAE1,
                   testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
  self.assertEqual(EXPECTED_MAE2,
                   testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
def testBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """Tests de-identifying two records in a single batched DLP request.

  With batch_size=2 the pipeline should make exactly one deidentify call
  covering both rows; findings are mapped back to rows via the rowIndex in
  each finding's contentLocations.

  NOTE(review): the three mock arguments are presumably injected by
  mock.patch decorators applied to this method — decorators are not visible
  in this chunk; confirm against the full file.
  """
  mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)
  # Fake BigQuery source with two patient records.
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }, {
      'first_name': 'Zephod',
      'last_name': 'Beeblebrox',
      'note': 'note2 text',
      'patient_id': '222',
      'record_number': '2'
  }]

  # Single deid response containing both rows of the batch.
  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Boaty'),
                      sval('McBoatface'),
                      sval('note1 redacted'),
                      sval('111'),
                      sval('1')
                  ]
              }, {
                  'values': [
                      sval('Zephod'),
                      sval('Beeblebrox'),
                      sval('note2 redacted'),
                      sval('222'),
                      sval('2')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }
  # One finding per row, distinguished by tableLocation.rowIndex.
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '0'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '1'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(execute=Mock(
      return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(execute=Mock(
      return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  # Fake BigQuery client returning the same two rows as the beam source.
  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'],
          ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR, 'testdata/batch_config.json')
  run_deid_lib.run_pipeline('input_query', None, 'deid_tbl', 'findings_tbl',
                            'gs://mae-bucket/mae-dir', 'mae_tbl',
                            deid_cfg_file, 'InspectPhiTask',
                            'fake-credentials', 'project',
                            testutil.FakeStorageClient, bq_client, None,
                            'dlp', batch_size=2, dtd_dir=None,
                            pipeline_args=None)

  # Exactly one batched request, matching the golden request body.
  expected_request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/batch_request.json')) as f:
    expected_request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(expected_request_body), ordered(kwargs['body']))

  self.assertEqual(beam_testutil.get_table('deid_tbl'), EXPECTED_DEID_RESULT)
  self.assertEqual(EXPECTED_MAE1,
                   testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
  self.assertEqual(EXPECTED_MAE2,
                   testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
def testMultiColumnDeid(self, mock_bq_source_fn, mock_bq_sink_fn,
                        mock_build_fn):
  """Tests de-identifying multiple columns ('note' and 'last_name') at once.

  The fake DLP deid response carries both transformed columns; the test
  verifies the request body and that both columns land in the output table.

  NOTE(review): the three mock arguments are presumably injected by
  mock.patch decorators applied to this method — decorators are not visible
  in this chunk; confirm against the full file.
  """
  # Extend the deid table schema with the extra de-identified column.
  table_to_schema = _TABLE_TO_SCHEMA.copy()
  table_to_schema['deid_tbl'] += ', last_name:STRING'
  mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)
  # Fake BigQuery source with a single patient record.
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }]

  # Deid response returns both transformed columns for the single row.
  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [{
                      'stringValue': 'deidtext'
                  }, {
                      'stringValue': 'myname'
                  }]
              }],
              'headers': [{
                  'name': 'note'
              }, {
                  'name': 'last_name'
              }]
          }
      }
  }
  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  # Findings mix empty tableLocations with an explicit rowIndex of 0.
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_CENSUS_NAME'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '0'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(execute=Mock(
      return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(execute=Mock(
      return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR,
                               'testdata/multi_column_config.json')
  mae_dir = ''  # Not compatible with multi-column.
  mae_table = ''  # Not compatible with multi-column.
  run_deid_lib.run_pipeline('input_query', None, 'deid_tbl', 'findings_tbl',
                            mae_dir, mae_table, deid_cfg_file,
                            'InspectPhiTask', 'fake-credentials', 'project',
                            testutil.FakeStorageClient, bq_client, None,
                            'dlp', batch_size=1, dtd_dir=None,
                            pipeline_args=None)

  # The single request must match the golden multi-column request body.
  request_body = {}
  with open(
      os.path.join(TESTDATA_DIR, 'testdata/multi_column_request.json')) as f:
    request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(request_body), ordered(kwargs['body']))

  # Both de-identified columns appear in the output table.
  self.assertEqual(beam_testutil.get_table('deid_tbl'), [{
      'patient_id': '111',
      'record_number': '1',
      'note': 'deidtext',
      'last_name': 'myname'
  }])
def testE2E(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """End-to-end de-id test covering MAE output, DTD files and field transforms.

  Runs one record through run_deid_lib.run_pipeline and verifies: the DLP
  request body, the MAE XML written to GCS and BigQuery, the classification
  DTD written to both GCS and a local directory, the de-identified table
  (including a field-transformed column) and the findings table.

  NOTE(review): the three mock arguments are presumably injected by
  mock.patch decorators applied to this method — decorators are not visible
  in this chunk; confirm against the full file.
  """
  # Extend the deid table schema with the field-transform column.
  table_to_schema = _TABLE_TO_SCHEMA.copy()
  table_to_schema['deid_tbl'] += ', field_transform_col:STRING'
  mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)
  # Fake BigQuery source with a single patient record.
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1',
      'field_transform_col': 'transform me!'
  }]

  # Deid response carries the redacted note plus the transformed column.
  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [{
                      'stringValue': 'deid_resp_val'
                  }, {
                      'stringValue': 'transformed!!'
                  }]
              }],
              'headers': [{
                  'name': 'note'
              }, {
                  'name': 'field_transform_col'
              }]
          }
      }
  }
  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_CENSUS_NAME'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(execute=Mock(
      return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(execute=Mock(
      return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg = os.path.join(TESTDATA_DIR, 'testdata/config.json')
  # Local directory where the pipeline should drop the classification DTD.
  dtd_dir = tempfile.mkdtemp()
  run_deid_lib.run_pipeline('input_query', None, 'deid_tbl', 'findings_tbl',
                            'gs://mae-bucket/mae-dir', 'mae_tbl', deid_cfg,
                            'InspectPhiTask', 'fake-credentials', 'project',
                            testutil.FakeStorageClient, bq_client, None,
                            'dlp', batch_size=1, dtd_dir=dtd_dir,
                            pipeline_args=None)

  # The DLP request must match the golden request body.
  request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/request.json')) as f:
    request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(request_body), ordered(kwargs['body']))

  # MAE XML is written both to GCS and to the MAE BigQuery table.
  with open(os.path.join(TESTDATA_DIR, 'mae_testdata', 'sample.xml')) as f:
    contents = f.read()
  self.assertEqual(
      testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'), contents)
  self.assertEqual(beam_testutil.get_table('mae_tbl'), [{
      'record_id': '111-1',
      'xml': contents
  }])
  # The classification DTD is written to GCS and to the local dtd_dir.
  with open(os.path.join(TESTDATA_DIR, 'mae_testdata', 'sample.dtd')) as f:
    contents = f.read()
  self.assertEqual(
      testutil.get_gcs_file('mae-bucket/mae-dir/classification.dtd'),
      contents)
  with open(os.path.join(dtd_dir, 'classification.dtd')) as local_dtd:
    self.assertEqual(local_dtd.read(), contents)

  self.assertEqual(beam_testutil.get_table('deid_tbl'), [{
      'patient_id': '111',
      'record_number': '1',
      'note': 'deid_resp_val',
      'field_transform_col': 'transformed!!'
  }])
  self.assertEqual(beam_testutil.get_table('findings_tbl'), [{
      'patient_id': '111',
      'record_number': '1',
      'findings': str(findings)
  }])