def testGenerateDtdGcs(self):
  deid_cfg = os.path.join(TESTDATA_DIR, 'testdata/config.json')
  deid_cfg_json = run_deid_lib.parse_config_file(deid_cfg)
  dtd_dir = 'gs://dtd-dir'
  run_deid_lib.run_pipeline(
      None, None, None, None, None, None, deid_cfg_json, 'InspectPhiTask',
      'project', testutil.FakeStorageClient, None, None, 'dlp',
      batch_size=1, dtd_dir=dtd_dir, input_csv=None, output_csv=None,
      timestamp=None, pipeline_args=None)

  with open(os.path.join(TESTDATA_DIR, 'mae_testdata', 'sample.dtd')) as f:
    self.assertEqual(
        testutil.get_gcs_file('dtd-dir/classification.dtd'), f.read())
def testCSV(self, mock_w2t_fn, mock_build_fn):
  mock_w2t_fn.side_effect = partial(self.make_csv_output)
  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [{
                      'stringValue': 'deid_resp_val'
                  }]
              }],
              'headers': [{
                  'name': 'note'
              }]
          }
      }
  }
  fake_content = Mock()
  fake_content.deidentify.return_value = Mock(
      execute=Mock(return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  deid_cfg = os.path.join(TESTDATA_DIR, 'sample_deid_config.json')
  deid_cfg_json = run_deid_lib.parse_config_file(deid_cfg)
  input_csv = os.path.join(TESTDATA_DIR, 'testdata/input.csv')
  run_deid_lib.run_pipeline(
      None, None, None, None, None, None, deid_cfg_json, 'InspectPhiTask',
      'project', testutil.FakeStorageClient, None, None, 'dlp',
      batch_size=1, dtd_dir=None, input_csv=input_csv,
      output_csv='output-csv', timestamp=DEID_TIMESTAMP, pipeline_args=None)

  fake_content.deidentify.assert_called_once()
  self.assertEqual(
      testutil.get_gcs_file('output-csv').strip(),
      '222,1,deid_resp_val,' + TIMESTAMP_STRING)
def main():
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser(
      description='Run Data Loss Prevention (DLP) DeID on Google Cloud.')
  run_deid_lib.add_all_args(parser)
  args, pipeline_args = parser.parse_known_args(sys.argv[1:])

  var = 'GOOGLE_APPLICATION_CREDENTIALS'
  if var not in os.environ or not os.environ[var]:
    raise Exception('You must specify service account credentials in the '
                    'GOOGLE_APPLICATION_CREDENTIALS environment variable.')

  _, default_project = google.auth.default()
  # Parse --project and re-add it to the pipeline args, swapping it out for
  # the default if it's not set.
  project = args.project
  if not project:
    project = default_project
  pipeline_args += ['--project', project]

  bq_client = bigquery.Client(project)
  # Older versions of the BigQuery client library do not have QueryJobConfig.
  bq_config_fn = None
  if hasattr(bigquery.job, 'QueryJobConfig'):
    bq_config_fn = bigquery.job.QueryJobConfig

  if not args.deid_config_file:
    raise Exception('Must provide DeID Config.')
  deid_config_json = run_deid_lib.parse_config_file(args.deid_config_file)

  timestamp = datetime.utcnow()
  errors = run_deid_lib.run_pipeline(
      args.input_query, args.input_table, args.deid_table,
      args.findings_table, args.mae_dir, args.mae_table, deid_config_json,
      args.mae_task_name, project, storage.Client, bq_client, bq_config_fn,
      args.dlp_api_name, args.batch_size, args.dtd_dir, args.input_csv,
      args.output_csv, timestamp, pipeline_args)

  if errors:
    logging.error(errors)
    return 1

  logging.info('Ran DLP API DeID.')
  return 0
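# A typical entry point for this script would look like the following. This
# is a sketch of an assumed pattern; the guard is not shown in the excerpt
# above. main() returns 1 on pipeline errors and 0 otherwise, so its result
# can be passed straight to sys.exit().
if __name__ == '__main__':
  sys.exit(main())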
def deidentify():
  """Runs the DLP pipeline."""
  if flask.request.method == 'GET':
    jobs, offset = model.get_list(model.DeidJobTable)
    result = [{
        'id': job['id'],
        'name': job['name'],
        'originalQuery': job['original_query'],
        'deidTable': job['deid_table'],
        'status': job['status'],
        'logTrace': job['log_trace'],
        'timestamp': job['timestamp'],
    } for job in jobs]
    return flask.jsonify(jobs=result, offset=offset), 200

  try:
    jsonschema.validate(flask.request.json, deid_schema)
  except jsonschema.ValidationError:
    error_msg = 'unable to validate provided payload.'
    return flask.jsonify(error=400, text=error_msg), 400

  job_data = {
      'name': flask.request.json['name'],
      'timestamp': datetime.utcnow(),
  }
  (input_query, input_table, deid_table, findings_table, mae_dir, mae_table,
   mae_task_name, batch_size, dtd_dir, input_csv, output_csv) = (
       None, None, None, None, None, None, None, None, None, None, None)
  request = flask.request

  # Determine the input.
  input_method, input_info = (request.json['inputMethod'],
                              request.json['inputInfo'])
  if input_method == 'input_table':
    input_table = input_info
    try:
      dataset, table = input_table.split('.')
      if not verify_bq_table(dataset, table, EXPECTED_CSV_SCHEMA):
        error_msg = ('input table schema does not match the expected one. '
                     'Expecting: {}'.format(', '.join(EXPECTED_CSV_SCHEMA)))
        return flask.jsonify(error=400, text=error_msg), 400
    except exceptions.NotFound:
      return flask.jsonify(error=400, text='unable to locate input data'), 400
    job_data['original_query'] = 'SELECT * FROM {}'.format(input_table)
  elif input_method == 'input_query':
    input_query = input_info
    job_data['original_query'] = input_query
    try:
      get_bq_rows(input_query)
    except exceptions.BadRequest:
      error_msg = 'invalid input query'
      return flask.jsonify(error=400, text=error_msg), 400
  elif input_method == 'input_csv':
    input_csv = input_info
  else:
    error_msg = 'wrong input method provided'
    return flask.jsonify(error=400, text=error_msg), 400

  # Determine the output.
  output_method, output_info = (request.json['outputMethod'],
                                request.json['outputInfo'])
  job_data['deid_table'] = output_info
  if output_method == 'deid_table':
    deid_table = output_info
    dataset, table = deid_table.split('.')
    try:
      if not verify_bq_table(dataset, table, EXPECTED_OUTPUT_SCHEMA):
        error_msg = ('output table schema does not match the expected one. '
                     'Expecting: {}'.format(', '.join(EXPECTED_OUTPUT_SCHEMA)))
        return flask.jsonify(error=400, text=error_msg), 400
    except exceptions.NotFound:
      # If the table is not found, a new one will be created.
      pass
  elif output_method == 'output_csv':
    output_csv = output_info
  else:
    error_msg = 'wrong output method provided'
    return flask.jsonify(error=400, text=error_msg), 400

  deid_config_json = run_deid_lib.parse_config_file(
      app.config['DEID_CONFIG_FILE'])

  findings_table = request.json.get('findingsTable')
  job_data['findings_table'] = findings_table
  try:
    dataset, table = findings_table.split('.')
    if not verify_bq_table(dataset, table, EXPECTED_FINDINGS_SCHEMA):
      error_msg = ('findings table schema does not match the expected one. '
                   'Expecting: {}'.format(', '.join(EXPECTED_FINDINGS_SCHEMA)))
      return flask.jsonify(error=400, text=error_msg), 400
  except exceptions.NotFound:
    # If the table is not found, a new one will be created.
    pass

  mae_table = request.json.get('maeTable')
  mae_dir = request.json.get('maeDir')
  batch_size = request.json.get('batchSize') or 1
  pipeline_args = ['--project', app.config['PROJECT_ID']]

  deid_job = model.create(model.DeidJobTable, job_data)
  errors = run_deid_lib.run_pipeline(
      input_query, input_table, deid_table, findings_table, mae_dir,
      mae_table, deid_config_json, mae_task_name, app.config['PROJECT_ID'],
      storage.Client, bq_client, bigquery.job.QueryJobConfig,
      app.config['DLP_API_NAME'], batch_size, dtd_dir, input_csv, output_csv,
      deid_job.timestamp, pipeline_args)
  if errors:
    deid_job.update(status=400, log_trace=errors)
    return flask.jsonify(error=400, text=errors), 400

  deid_job.update(status=200)
  return flask.jsonify(result='success'), 200
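# For illustration, a POST payload accepted by deidentify() above might look
# like the following. The field names are taken from the handler's reads of
# request.json; the values are made up:
#
# {
#   "name": "my-deid-job",
#   "inputMethod": "input_table",
#   "inputInfo": "my_dataset.input_table",
#   "outputMethod": "deid_table",
#   "outputInfo": "my_dataset.deid_output",
#   "findingsTable": "my_dataset.findings",
#   "maeDir": "gs://my-bucket/mae",
#   "batchSize": 2
# }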
def testReBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """Tests that a failing or truncated batch is split and retried per-row."""
  mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }, {
      'first_name': 'Zephod',
      'last_name': 'Beeblebrox',
      'note': 'note2 text',
      'patient_id': '222',
      'record_number': '2'
  }]

  deid_response1 = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Boaty'), sval('McBoatface'),
                      sval('note1 redacted'), sval('111'), sval('1')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }
  deid_response2 = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Zephod'), sval('Beeblebrox'),
                      sval('note2 redacted'), sval('222'), sval('2')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }

  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings1 = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }]
  }
  findings2 = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  # The first inspect call reports truncated findings, so the pipeline must
  # re-inspect the rows individually.
  inspect_response_truncated = {'result': {'findingsTruncated': 'True'}}
  inspect_responses = [
      inspect_response_truncated, {'result': findings1}, {'result': findings2}
  ]

  def inspect_execute():
    response = inspect_responses[inspect_execute.call_count]
    inspect_execute.call_count += 1
    return response
  inspect_execute.call_count = 0

  fake_content = Mock()
  fake_content.inspect.return_value = Mock(execute=inspect_execute)

  # The first deidentify call fails, so the pipeline must re-send the batch
  # as two single-row requests.
  deid_responses = ['Exception', deid_response1, deid_response2]

  def deid_execute():
    response = deid_responses[deid_execute.call_count]
    deid_execute.call_count += 1
    if response == 'Exception':
      content = ('{"error": {"message": "Too many findings to de-identify. '
                 'Retry with a smaller request."}}').encode('utf-8')
      raise errors.HttpError(httplib2.Response({'status': 400}), content)
    return response
  deid_execute.call_count = 0

  fake_content.deidentify.return_value = Mock(execute=deid_execute)
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  query_job = Mock()
  rows = [[
      'Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1',
      DEID_TIMESTAMP
  ], ['Zephod', 'Beeblebrox', 'note2 text', '222', '2', DEID_TIMESTAMP]]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR, 'testdata/batch_config.json')
  deid_cfg_json = run_deid_lib.parse_config_file(deid_cfg_file)
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl',
      'gs://mae-bucket/mae-dir', 'mae_tbl', deid_cfg_json, 'InspectPhiTask',
      'project', testutil.FakeStorageClient, bq_client, None, 'dlp',
      batch_size=2, dtd_dir=None, input_csv=None, output_csv=None,
      timestamp=DEID_TIMESTAMP, pipeline_args=None)

  expected_request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/batch_request.json')) as f:
    expected_request_body = json.load(f)
  fake_content.deidentify.assert_called()
  _, kwargs = fake_content.deidentify.call_args_list[0]
  self.maxDiff = 10000
  self.assertEqual(ordered(expected_request_body), ordered(kwargs['body']))

  self.assertEqual(beam_testutil.get_table('deid_tbl'), EXPECTED_DEID_RESULT)
  self.assertEqual(EXPECTED_MAE1,
                   testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
  self.assertEqual(EXPECTED_MAE2,
                   testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
def testBatchDeid(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """Tests that two rows are de-identified together in a single request."""
  mock_bq_sink_fn.side_effect = partial(self.make_sink, _TABLE_TO_SCHEMA)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }, {
      'first_name': 'Zephod',
      'last_name': 'Beeblebrox',
      'note': 'note2 text',
      'patient_id': '222',
      'record_number': '2'
  }]

  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [
                      sval('Boaty'), sval('McBoatface'),
                      sval('note1 redacted'), sval('111'), sval('1')
                  ]
              }, {
                  'values': [
                      sval('Zephod'), sval('Beeblebrox'),
                      sval('note2 redacted'), sval('222'), sval('2')
                  ]
              }],
              'headers': DEID_HEADERS
          }
      }
  }
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '0'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '1'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}

  fake_content = Mock()
  fake_content.inspect.return_value = Mock(
      execute=Mock(return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(
      execute=Mock(return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  query_job = Mock()
  rows = [[
      'Boaty', 'McBoatface', 'text and PID and MORE PID', '111', '1',
      DEID_TIMESTAMP
  ], ['Zephod', 'Beeblebrox', 'note2 text', '222', '2', DEID_TIMESTAMP]]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR, 'testdata/batch_config.json')
  deid_cfg_json = run_deid_lib.parse_config_file(deid_cfg_file)
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl',
      'gs://mae-bucket/mae-dir', 'mae_tbl', deid_cfg_json, 'InspectPhiTask',
      'project', testutil.FakeStorageClient, bq_client, None, 'dlp',
      batch_size=2, dtd_dir=None, input_csv=None, output_csv=None,
      timestamp=DEID_TIMESTAMP, pipeline_args=None)

  expected_request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/batch_request.json')) as f:
    expected_request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(expected_request_body), ordered(kwargs['body']))

  self.assertEqual(beam_testutil.get_table('deid_tbl'), EXPECTED_DEID_RESULT)
  self.assertEqual(EXPECTED_MAE1,
                   testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'))
  self.assertEqual(EXPECTED_MAE2,
                   testutil.get_gcs_file('mae-bucket/mae-dir/222-2.xml'))
def testMultiColumnDeid(self, mock_bq_source_fn, mock_bq_sink_fn,
                        mock_build_fn):
  """Tests de-identifying multiple columns ('note' and 'last_name') at once."""
  table_to_schema = _TABLE_TO_SCHEMA.copy()
  table_to_schema['deid_tbl'] += ', last_name:STRING'
  mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1'
  }]

  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [{
                      'stringValue': 'deidtext'
                  }, {
                      'stringValue': 'myname'
                  }]
              }],
              'headers': [{
                  'name': 'note'
              }, {
                  'name': 'last_name'
              }]
          }
      }
  }
  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_CENSUS_NAME'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': [{
                  'recordLocation': {
                      'tableLocation': {
                          'rowIndex': '0'
                      }
                  }
              }]
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}

  fake_content = Mock()
  fake_content.inspect.return_value = Mock(
      execute=Mock(return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(
      execute=Mock(return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum', DEID_TIMESTAMP]]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg_file = os.path.join(TESTDATA_DIR,
                               'testdata/multi_column_config.json')
  deid_cfg_json = run_deid_lib.parse_config_file(deid_cfg_file)
  mae_dir = ''  # Not compatible with multi-column.
  mae_table = ''  # Not compatible with multi-column.
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl', mae_dir, mae_table,
      deid_cfg_json, 'InspectPhiTask', 'project', testutil.FakeStorageClient,
      bq_client, None, 'dlp', batch_size=1, dtd_dir=None, input_csv=None,
      output_csv=None, timestamp=DEID_TIMESTAMP, pipeline_args=None)

  request_body = {}
  with open(
      os.path.join(TESTDATA_DIR, 'testdata/multi_column_request.json')) as f:
    request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(request_body), ordered(kwargs['body']))

  self.assertEqual(beam_testutil.get_table('deid_tbl'), [{
      'patient_id': '111',
      'record_number': '1',
      'note': 'deidtext',
      'last_name': 'myname',
      run_deid_lib.DLP_DEID_TIMESTAMP: TIMESTAMP_STRING
  }])
def testE2E(self, mock_bq_source_fn, mock_bq_sink_fn, mock_build_fn):
  """End-to-end test covering deid, findings, MAE, and DTD outputs."""
  table_to_schema = _TABLE_TO_SCHEMA.copy()
  table_to_schema['deid_tbl'] += ', field_transform_col:STRING'
  mock_bq_sink_fn.side_effect = partial(self.make_sink, table_to_schema)
  mock_bq_source_fn.return_value = beam_testutil.FakeSource()
  mock_bq_source_fn.return_value._records = [{
      'first_name': 'Boaty',
      'last_name': 'McBoatface',
      'note': 'text and PID and MORE PID',
      'patient_id': '111',
      'record_number': '1',
      'field_transform_col': 'transform me!'
  }]

  deid_response = {
      'item': {
          'table': {
              'rows': [{
                  'values': [{
                      'stringValue': 'deid_resp_val'
                  }, {
                      'stringValue': 'transformed!!'
                  }]
              }],
              'headers': [{
                  'name': 'note'
              }, {
                  'name': 'field_transform_col'
              }]
          }
      }
  }
  empty_locations = [{'recordLocation': {'tableLocation': {}}}]
  findings = {
      'findings': [{
          'location': {
              'codepointRange': {
                  'start': '17',
                  'end': '25'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'PHONE_NUMBER'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_CENSUS_NAME'
          }
      }, {
          'location': {
              'codepointRange': {
                  'start': '9',
                  'end': '12'
              },
              'contentLocations': empty_locations
          },
          'infoType': {
              'name': 'US_MALE_NAME'
          }
      }]
  }
  inspect_response = {'result': findings}

  fake_content = Mock()
  fake_content.inspect.return_value = Mock(
      execute=Mock(return_value=inspect_response))
  fake_content.deidentify.return_value = Mock(
      execute=Mock(return_value=deid_response))
  fake_projects = Mock(content=Mock(return_value=fake_content))
  fake_dlp = Mock(projects=Mock(return_value=fake_projects))
  mock_build_fn.return_value = fake_dlp

  query_job = Mock()
  rows = [['Boaty', 'McBoatface', 'note', 'id', 'recordnum', DEID_TIMESTAMP]]
  results_table = FakeBqResults(bq_schema(), rows)
  query_job.destination.fetch_data.return_value = results_table
  bq_client = Mock()
  bq_client.run_async_query.return_value = query_job

  deid_cfg = os.path.join(TESTDATA_DIR, 'testdata/config.json')
  deid_cfg_json = run_deid_lib.parse_config_file(deid_cfg)
  dtd_dir = tempfile.mkdtemp()
  run_deid_lib.run_pipeline(
      'input_query', None, 'deid_tbl', 'findings_tbl',
      'gs://mae-bucket/mae-dir', 'mae_tbl', deid_cfg_json, 'InspectPhiTask',
      'project', testutil.FakeStorageClient, bq_client, None, 'dlp',
      batch_size=1, dtd_dir=dtd_dir, input_csv=None, output_csv=None,
      timestamp=DEID_TIMESTAMP, pipeline_args=None)

  request_body = {}
  with open(os.path.join(TESTDATA_DIR, 'testdata/request.json')) as f:
    request_body = json.load(f)
  fake_content.deidentify.assert_called_once()
  _, kwargs = fake_content.deidentify.call_args
  self.maxDiff = 10000
  self.assertEqual(ordered(request_body), ordered(kwargs['body']))

  with open(os.path.join(TESTDATA_DIR, 'mae_testdata', 'sample.xml')) as f:
    contents = f.read()
  self.assertEqual(
      testutil.get_gcs_file('mae-bucket/mae-dir/111-1.xml'), contents)
  self.assertEqual(beam_testutil.get_table('mae_tbl'),
                   [{'record_id': '111-1', 'xml': contents}])

  with open(os.path.join(TESTDATA_DIR, 'mae_testdata', 'sample.dtd')) as f:
    contents = f.read()
  self.assertEqual(
      testutil.get_gcs_file('mae-bucket/mae-dir/classification.dtd'),
      contents)
  with open(os.path.join(dtd_dir, 'classification.dtd')) as local_dtd:
    self.assertEqual(local_dtd.read(), contents)

  self.assertEqual(beam_testutil.get_table('deid_tbl'), [{
      'patient_id': '111',
      'record_number': '1',
      'note': 'deid_resp_val',
      'field_transform_col': 'transformed!!',
      run_deid_lib.DLP_DEID_TIMESTAMP: TIMESTAMP_STRING
  }])
  self.assertEqual(beam_testutil.get_table('findings_tbl'), [{
      'patient_id': '111',
      'record_number': '1',
      'findings': json.dumps(findings),
      run_deid_lib.DLP_FINDINGS_TIMESTAMP: TIMESTAMP_STRING
  }])
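# The tests above rely on two small helpers defined elsewhere in this file.
# The sketches below are plausible reconstructions, assumed from how the
# helpers are used here, not the verbatim originals.

def sval(value):
  # Wraps a string in the DLP API's Value representation, matching the
  # {'stringValue': ...} dicts built inline in testCSV and testMultiColumnDeid.
  return {'stringValue': value}


def ordered(obj):
  # Recursively sorts dicts and lists so that two JSON-like request bodies
  # can be compared for equality regardless of key or element ordering.
  if isinstance(obj, dict):
    return sorted((k, ordered(v)) for k, v in obj.items())
  if isinstance(obj, list):
    return sorted(ordered(x) for x in obj)
  return obj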