def testNonMatchingNotes(self, mock_bq_source_fn, mock_bq_sink_fn):
  """The pipeline raises if a note's text differs from its golden text."""
  # Route BigQuery writes to an in-memory fake table; schema and write
  # disposition are irrelevant for this failure-path test.
  mock_bq_sink_fn.side_effect = (
      lambda table_name, schema, write_disposition:  # pylint: disable=g-long-lambda
      beam_testutil.FakeSink(table_name))

  xml_text_template = """<?xml version="1.0" encoding="UTF-8" ?>
<InspectPhiTask>
<TEXT><![CDATA[{0}]]></TEXT>
<TAGS></TAGS></InspectPhiTask>
"""
  # Findings and golden records for the same id deliberately disagree on
  # the note text, which must make run_pipeline raise.
  fake_source = beam_testutil.FakeSource()
  fake_source._records = [{
      'findings_record_id': '111-1',
      'findings_xml': xml_text_template.format('some text'),
      'golden_xml': xml_text_template.format('different text')
  }]
  mock_bq_source_fn.return_value = fake_source

  # Arguments for features exercised by other tests; unused here.
  input_pattern = None
  golden_dir = None
  results_dir = None
  per_note_table = None
  debug_table = None
  types_to_ignore = None
  pipeline_args = None
  mae_input_query = 'SELECT * from [project.dataset.table]'
  mae_golden_table = 'project.dataset.golden_table'
  self.assertRaisesRegexp(
      Exception,
      'Note text is different from golden for record \"111-1\"',
      run_pipeline_lib.run_pipeline, input_pattern, golden_dir, results_dir,
      mae_input_query, mae_golden_table, False, 'results_table',
      per_note_table, debug_table, types_to_ignore, pipeline_args)
def testE2eBigquery(self, mock_bq_source_fn, mock_bq_sink_fn,
                    mock_utcnow_fn):
  """End-to-end evaluation run with BigQuery input and output mocked.

  A single record yields 1 true positive, 1 false positive and 2 false
  negatives for infoType 'TypeA'. Verifies both the findings/golden join
  query passed to the BigQuery source and the per-infoType stats rows
  written to 'results_table'.
  """
  # Route BigQuery writes to in-memory fake tables keyed by table name.
  def make_sink(table_name, schema, write_disposition):  # pylint: disable=unused-argument
    return beam_testutil.FakeSink(table_name)
  mock_bq_sink_fn.side_effect = make_sink
  # Freeze the timestamp so expected result rows can match exactly.
  now = 'current time'
  mock_utcnow_fn.return_value = now

  # Only tp_tag appears in both findings and golden; fp_tag is
  # findings-only (false positive), fn/fn2 are golden-only (false
  # negatives).
  tp_tag = tag_template.format('TypeA', 0, 5)
  fp_tag = tag_template.format('TypeA', 8, 10)
  fn_tag = tag_template.format('TypeA', 11, 13)
  fn2_tag = tag_template.format('TypeA', 15, 19)
  findings_tags = '\n'.join([tp_tag, fp_tag])
  golden_tags = '\n'.join([tp_tag, fn_tag, fn2_tag])
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'findings_record_id': '111-1',
      'findings_xml': xml_template.format(findings_tags),
      'golden_xml': xml_template.format(golden_tags)
  }]

  types_to_ignore = ['ignore']
  # These features are tested in testE2eGCS.
  input_pattern, golden_dir, results_dir, per_note_table, debug_table = (
      None, None, None, None, None)
  mae_input_query = 'SELECT * from [project.dataset.table]'
  mae_golden_table = 'project.dataset.golden_table'
  run_pipeline_lib.run_pipeline(input_pattern, golden_dir, results_dir,
                                mae_input_query, mae_golden_table, False,
                                'results_table', per_note_table, debug_table,
                                types_to_ignore, pipeline_args=None)

  # Check that we generated the query correctly.
  mock_bq_source_fn.assert_called_with(
      query=('SELECT findings.record_id, findings.xml, golden.xml FROM '
             '(SELECT * from [project.dataset.table]) AS findings '
             'LEFT JOIN [project.dataset.golden_table] AS golden '
             'ON findings.record_id=golden.record_id'))

  # Check we wrote the correct results to BigQuery.
  # recall = 1/(1+2), precision = 1/(1+1), f_score = harmonic mean.
  expected_results = [{
      'info_type': 'ALL',
      'recall': 0.333333,
      'precision': 0.5,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }, {
      'info_type': u'TypeA',
      'recall': 0.333333,
      'precision': 0.5,
      'f_score': 0.4,
      'true_positives': 1,
      'false_positives': 1,
      'false_negatives': 2
  }]
  # Every result row is stamped with the (mocked) pipeline run time.
  for r in expected_results:
    r.update({'timestamp': now})
  # Sort by info_type so the comparison is order-independent.
  actual_results = sorted(
      beam_testutil.get_table('results_table'), key=lambda x: x['info_type'])
  self.assertEqual([normalize_dict_floats(r) for r in expected_results],
                   [normalize_dict_floats(r) for r in actual_results])
def testReBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """Batch deid falls back to smaller batches on truncation/overflow.

  The first inspect response reports findingsTruncated and the first
  deidentify call raises HTTP 400 ("Too many findings"), so the pipeline
  must retry the two input rows as individual requests and still produce
  the full deid/findings/MAE outputs.
  """
  mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }, {
      'first_name': 'Zephod',
      'last_name': 'Beeblebrox',
      'note': 'note2 text',
      'patient_id': '222',
      'record_number': '2'
  }]
  # Per-row DLP deidentify responses used after the batch is split.
  deid_response1 = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Boaty'),
                      sval('McBoatface'),
                      sval('note1 redacted'),
                      sval('111'),
                      sval('1')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }
  deid_response2 = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Zephod'),
                      sval('Beeblebrox'),
                      sval('note2 redacted'),
                      sval('222'),
                      sval('2')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }
  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings1 = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }]
  }
  findings2 = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  # First inspect response is truncated, forcing per-row re-batching.
  inspect_response_truncated = {'result': {'findingsTruncated': 'True'}}
  inspect_responses = [
      inspect_response_truncated, {
          'result': findings1
      }, {
          'result': findings2
      }
  ]

  # Stateful stub: hands out the canned inspect responses in order.
  def inspect_execute():
    response = inspect_responses[inspect_execute.call_count]
    inspect_execute.call_count += 1
    return response

  inspect_execute.call_count = 0
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(execute=inspect_execute)

  # First deidentify call fails with HTTP 400, then per-row calls succeed.
  deid_responses = ['Exception', deid_response1, deid_response2]

  def deid_execute():
    response = deid_responses[deid_execute.call_count]
    deid_execute.call_count += 1
    if response == 'Exception':
      content = ('{"error": {"message": "Too many findings to de-identify. '
                 'Retry with a smaller request."}}')
      raise errors.HttpError(httplib2.Response({'status': 400}), content)
    return response

  deid_execute.call_count = 0
  fake_content.deidentify.return_value = Mock(execute=deid_execute)
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  # Fake the BigQuery query job that feeds rows into the pipeline.
  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'],
          ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR, 'testdata/batch_config.json')

  # batch_size=2 puts both rows into the initial (failing) request.
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl',
      'gs://mae-bucket/mae-dir', 'mae_tbl', deid_cfg_file, 'InspectPhiTask',
      'fake-credentials', 'project', testutil.FakeStorageClient, bq_client,
      None, 'dlp', batch_size=2, dtd_dir=None, pipeline_args=None)

  # The first deidentify request must match the full 2-row batch body.
  expected_request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/batch_request.json')) as f:
    expected_request_body = json.load(f)
  fake_content.deidentify.assert_called()
  _, kwargs = fake_content.deidentify.call_args_list[0]
  self.maxDiff = 10000
  self.assertEqual(ordered(expected_request_body), ordered(kwargs['body']))

  # Despite the retry, the final outputs are the same as a clean run.
  self.assertEqual(beam_testutil.get_table('deid_tbl'), EXPECTED_DEID_RESULT)
  self.assertEqual(EXPECTED_MAE1,
                   testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
  self.assertEqual(EXPECTED_MAE2,
                   testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
def testBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """Two input rows are de-identified in a single batched DLP request.

  Checks the request body sent to deidentify, the rows written to
  'deid_tbl', and the per-record MAE XML files written to GCS.
  """
  mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }, {
      'first_name': 'Zephod',
      'last_name': 'Beeblebrox',
      'note': 'note2 text',
      'patient_id': '222',
      'record_number': '2'
  }]
  # Single DLP response covering both rows of the batch.
  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Boaty'),
                      sval('McBoatface'),
                      sval('note1 redacted'),
                      sval('111'),
                      sval('1')
                  ]
              }, {
                  'values': [
                      sval('Zephod'),
                      sval('Beeblebrox'),
                      sval('note2 redacted'),
                      sval('222'),
                      sval('2')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }
  # Findings use rowIndex to attribute each finding to its batch row.
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '0'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '1'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(
      execute=Mock(return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(
      execute=Mock(return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  # Fake the BigQuery query job that feeds rows into the pipeline.
  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1'],
          ['Zephod', 'Beeblebrox', 'note2 text', '222', '2']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR, 'testdata/batch_config.json')

  # batch_size=2 groups both rows into one request.
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl',
      'gs://mae-bucket/mae-dir', 'mae_tbl', deid_cfg_file, 'InspectPhiTask',
      'fake-credentials', 'project', testutil.FakeStorageClient, bq_client,
      None, 'dlp', batch_size=2, dtd_dir=None, pipeline_args=None)

  expected_request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/batch_request.json')) as f:
    expected_request_body = json.load(f)
  # Exactly one deidentify call proves the rows were truly batched.
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(expected_request_body), ordered(kwargs['body']))

  self.assertEqual(beam_testutil.get_table('deid_tbl'), EXPECTED_DEID_RESULT)
  self.assertEqual(EXPECTED_MAE1,
                   testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
  self.assertEqual(EXPECTED_MAE2,
                   testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
def testMultiColumnDeid(self, mock_bq_source_fn, mock_bq_sink_fn,
                        mock_build_fn):
  """De-identification of multiple columns ('note' and 'last_name').

  MAE output is disabled (empty dir/table) because MAE is
  single-column-only; verifies the multi-column request body and that
  both transformed columns land in 'deid_tbl'.
  """
  # The output schema gains a last_name column for this test only.
  table_to_schema = _TABLE_TO_SCHEMA.copy()
  table_to_schema['deid_tbl'] += ', last_name:STRING'
  mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }]
  # DLP returns transformed values for both inspected columns.
  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [{
                      'stringValue': 'deidtext'
                  }, {
                      'stringValue': 'myname'
                  }]
              }],
              'headers': [{
                  'name': 'note'
              }, {
                  'name': 'last_name'
              }]
          }
      }
  }
  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_CENSUS_NAME'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '0'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(
      execute=Mock(return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(
      execute=Mock(return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  # Fake the BigQuery query job that feeds the single row in.
  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR,
                               'testdata/multi_column_config.json')

  mae_dir = ''  # Not compatible with multi-column.
  mae_table = ''  # Not compatible with multi-column.
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl', mae_dir, mae_table,
      deid_cfg_file, 'InspectPhiTask', 'fake-credentials', 'project',
      testutil.FakeStorageClient, bq_client, None, 'dlp', batch_size=1,
      dtd_dir=None, pipeline_args=None)

  request_body = {}
  with open(
      os.path.join(TESTDATA_DIR, 'testdata/multi_column_request.json')) as f:
    request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(request_body), ordered(kwargs['body']))

  # Both transformed columns must be written to the output table.
  self.assertEqual(
      beam_testutil.get_table('deid_tbl'), [{
          'patient_id': '111',
          'record_number': '1',
          'note': 'deidtext',
          'last_name': 'myname'
      }])
def testE2E(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """Full single-row run: deid, findings, MAE XML/DTD to GCS and disk.

  Verifies the DLP request body, the MAE XML written to GCS and to
  'mae_tbl', the DTD copied to both GCS and the local dtd_dir, and the
  rows written to 'deid_tbl' and 'findings_tbl' (including a
  per-field transform column).
  """
  # The output schema gains a field_transform_col for this test only.
  table_to_schema = _TABLE_TO_SCHEMA.copy()
  table_to_schema['deid_tbl'] += ', field_transform_col:STRING'
  mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1',
      'field_transform_col': 'transform me!'
  }]
  # DLP returns the redacted note plus the field-transformed column.
  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [{
                      'stringValue': 'deid_resp_val'
                  }, {
                      'stringValue': 'transformed!!'
                  }]
              }],
              'headers': [{
                  'name': 'note'
              }, {
                  'name': 'field_transform_col'
              }]
          }
      }
  }
  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_CENSUS_NAME'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}
  fake_content = Mock()
  fake_content.inspect.return_value = Mock(
      execute=Mock(return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(
      execute=Mock(return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  # Fake the BigQuery query job that feeds the single row in.
  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum']]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg = os.path.join(TESTDATA_DIR, 'testdata/config.json')

  # Real temp dir so the locally-written DTD can be read back below.
  dtd_dir = tempfile.mkdtemp()
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl',
      'gs://mae-bucket/mae-dir', 'mae_tbl', deid_cfg, 'InspectPhiTask',
      'fake-credentials', 'project', testutil.FakeStorageClient, bq_client,
      None, 'dlp', batch_size=1, dtd_dir=dtd_dir, pipeline_args=None)

  request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/request.json')) as f:
    request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(request_body), ordered(kwargs['body']))

  # MAE XML written to both GCS and the BigQuery MAE table.
  with open(os.path.join(TESTDATA_DIR, 'mae_testdata', 'sample.xml')) as f:
    contents = f.read()
  self.assertEqual(
      testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'), contents)
  self.assertEqual(
      beam_testutil.get_table('mae_tbl'), [{
          'record_id': '111-1',
          'xml': contents
      }])

  # DTD written to GCS and mirrored to the local dtd_dir.
  with open(os.path.join(TESTDATA_DIR, 'mae_testdata', 'sample.dtd')) as f:
    contents = f.read()
  self.assertEqual(
      testutil.get_gcs_file('mae-bucket/mae-dir/classification.dtd'),
      contents)
  with open(os.path.join(dtd_dir, 'classification.dtd')) as local_dtd:
    self.assertEqual(local_dtd.read(), contents)

  self.assertEqual(
      beam_testutil.get_table('deid_tbl'), [{
          'patient_id': '111',
          'record_number': '1',
          'note': 'deid_resp_val',
          'field_transform_col': 'transformed!!'
      }])
  # Findings are serialized with str() into the findings table.
  self.assertEqual(
      beam_testutil.get_table('findings_tbl'), [{
          'patient_id': '111',
          'record_number': '1',
          'findings': str(findings)
      }])