def test_fields_for(self):
    """
    Testing that fields_for works as expected with sub-directory structures.
    """
    # preconditions

    # test
    actual_fields = resources.fields_for('person')

    # post conditions
    person_path = os.path.join(resources.base_path, 'resource_files',
                               'schemas', 'cdm', 'clinical', 'person.json')
    with open(person_path, 'r') as fp:
        expected_fields = json.load(fp)

    self.assertEqual(actual_fields, expected_fields)

    # test
    actual_fields = resources.fields_for('person_ext')

    person_ext_path = os.path.join(resources.base_path, 'resource_files',
                                   'schemas', 'extension_tables',
                                   'person_ext.json')
    with open(person_ext_path, 'r') as fp:
        expected_fields = json.load(fp)

    self.assertEqual(actual_fields, expected_fields)
def test_get_person_id_validation_queries_deid(self):
    # pre conditions

    # test
    results = validator.get_person_id_validation_queries('foo', 'bar_deid')

    # post conditions
    self.assertEqual(len(results), (len(self.all_tables) * 2) - 1)

    existing_and_consenting = validator.EXISTING_AND_VALID_CONSENTING_RECORDS
    existing_in_person_table = validator.SELECT_EXISTING_PERSON_IDS

    expected = []
    for table in self.mapped_tables:
        field_names = [
            'entry.' + field['name'] for field in resources.fields_for(table)
        ]
        fields = ', '.join(field_names)

        expected.append({
            clean_consts.QUERY:
                existing_and_consenting.format(project='foo',
                                               dataset='bar_deid',
                                               mapping_dataset='bar',
                                               table=table,
                                               fields=fields),
            clean_consts.DESTINATION_TABLE: table,
            clean_consts.DESTINATION_DATASET: 'bar_deid',
            clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
        })

    for table in self.all_tables:
        field_names = [
            'entry.' + field['name'] for field in resources.fields_for(table)
        ]
        fields = ', '.join(field_names)

        expected.append({
            clean_consts.QUERY:
                existing_in_person_table.format(project='foo',
                                                dataset='bar_deid',
                                                table=table,
                                                fields=fields),
            clean_consts.DESTINATION_TABLE: table,
            clean_consts.DESTINATION_DATASET: 'bar_deid',
            clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
        })

    self.assertEqual(expected, results)
def _mapping_table_checks(self):
    """
    Check that mapping tables exist, have the correct schema, and have the
    expected number of records.
    """
    where = (
        'WHERE EXISTS '
        ' (SELECT 1 FROM `{combined_dataset_id}.{ehr_consent_table_id}` AS c '
        ' WHERE t.person_id = c.person_id)').format(
            combined_dataset_id=self.combined_dataset_id,
            ehr_consent_table_id=EHR_CONSENT_TABLE_ID)
    ehr_counts = test_util.get_table_counts(self.ehr_dataset_id,
                                            DOMAIN_TABLES, where)
    rdr_counts = test_util.get_table_counts(self.rdr_dataset_id)
    combined_counts = test_util.get_table_counts(self.combined_dataset_id)
    output_tables = combined_counts.keys()

    expected_counts = dict()
    expected_diffs = ['observation']

    for table in DOMAIN_TABLES:
        expected_mapping_table = mapping_table_for(table)
        self.assertIn(expected_mapping_table, output_tables)
        expected_fields = resources.fields_for(expected_mapping_table)
        actual_table_info = bq_utils.get_table_info(expected_mapping_table,
                                                    self.combined_dataset_id)
        actual_fields = actual_table_info.get('schema',
                                              dict()).get('fields', [])
        actual_fields_norm = map(test_util.normalize_field_payload,
                                 actual_fields)
        self.assertCountEqual(expected_fields, actual_fields_norm)

        # Count should be the sum of EHR and RDR counts
        # (except for tables like observation where extra records are created for demographics)
        if 'person_id' in [
                field.get('name', '') for field in resources.fields_for(table)
        ]:
            unconsented_ehr_records = self.get_unconsented_ehr_records_count(
                table)
        else:
            unconsented_ehr_records = 0

        actual_count = combined_counts[expected_mapping_table]

        if table in expected_diffs:
            expected_count = actual_count
        else:
            expected_count = (ehr_counts[table] -
                              unconsented_ehr_records) + rdr_counts[table]
        expected_counts[expected_mapping_table] = expected_count

    self.assertDictContainsSubset(expected_counts, combined_counts)
def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
    queries_list = []
    for table in self.affected_tables:
        fields = [field['name'] for field in resources.fields_for(table)]
        fields_to_replace = get_fields_dict(table, fields)

        if fields_to_replace:
            cols = get_modified_columns(fields, fields_to_replace)

            full_join_expression = get_full_join_expression(
                self.dataset_id, self.project_id, fields_to_replace)

            query = dict()
            query[cdr_consts.QUERY] = FIELD_REPLACE_QUERY.render(
                columns=cols,
                table_name=table,
                dataset=self.dataset_id,
                project=self.project_id,
                join_expression=full_join_expression)
            query[cdr_consts.DESTINATION_TABLE] = table
            query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
            query[cdr_consts.DESTINATION_DATASET] = self.dataset_id
            queries_list.append(query)
    return queries_list
def parse_src_concept_id_update_query(project_id, dataset_id, table_name):
    """
    Fill in template query used to generate updated domain table

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :param table_name: name of a domain table
    :return: parsed src_concept_id_update query
    """
    fields = [field['name'] for field in resources.fields_for(table_name)]
    col_exprs = []
    fields_to_replace = {
        resources.get_domain_id_field(table_name): 'dest_id',
        resources.get_domain_concept_id(table_name): 'new_concept_id',
        resources.get_domain_source_concept_id(table_name): 'new_src_concept_id'
    }
    for field_name in fields:
        if field_name in fields_to_replace:
            col_expr = 'coalesce({replace_field}, {field}) AS {field}'.format(
                replace_field=fields_to_replace[field_name], field=field_name)
        else:
            col_expr = field_name
        col_exprs.append(col_expr)
    cols = ', '.join(col_exprs)

    return SRC_CONCEPT_ID_UPDATE_QUERY.format(
        cols=cols,
        project=project_id,
        dataset=dataset_id,
        domain_table=table_name,
        logging_table=SRC_CONCEPT_ID_TABLE_NAME)
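# Illustration only (not part of the source module): a minimal sketch of the
# column expressions the loop above produces, using hypothetical field names
# for a condition_occurrence-like table. In the real function the field names
# come from resources.fields_for and the resources.get_domain_* helpers.
_example_fields = [
    'condition_occurrence_id', 'condition_concept_id',
    'condition_source_concept_id', 'condition_start_date'
]
_example_replacements = {
    'condition_occurrence_id': 'dest_id',
    'condition_concept_id': 'new_concept_id',
    'condition_source_concept_id': 'new_src_concept_id'
}
_example_cols = ', '.join(
    'coalesce({r}, {f}) AS {f}'.format(r=_example_replacements[f], f=f)
    if f in _example_replacements else f for f in _example_fields)
# _example_cols starts with
# 'coalesce(dest_id, condition_occurrence_id) AS condition_occurrence_id, ...'
# and ends with the untouched column 'condition_start_date'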
def parse_mapping_table_update_query(project_id, dataset_id, table_name,
                                     mapping_table_name):
    """
    Fill in the mapping table query so that dest_id is taken either from the
    logging table or from the domain table

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :param table_name: name of the domain table for which the query needs to be parsed
    :param mapping_table_name: name of the mapping table for which the query needs to be parsed
    :return: parsed update mapping table query
    """
    fields = [
        field['name'] for field in resources.fields_for(mapping_table_name)
    ]
    col_exprs = []
    for field_name in fields:
        if field_name == resources.get_domain_id_field(table_name):
            col_expr = 'coalesce(dest_id, {field}) AS {field}'.format(
                field=field_name)
        else:
            col_expr = field_name
        col_exprs.append(col_expr)
    cols = ', '.join(col_exprs)

    return UPDATE_MAPPING_TABLES_QUERY.format(
        cols=cols,
        project=project_id,
        dataset=dataset_id,
        mapping_table=mapping_table_name,
        logging_table=SRC_CONCEPT_ID_TABLE_NAME,
        domain_table=table_name)
def get_queries(project=None, dataset=None):
    """
    Return a list of queries to remove data for missing persons.

    Removes data from person_id linked tables for any persons who do not
    exist in the person table.

    :return: A list of string queries that can be executed to delete data from
        other tables for non-person users.
    """
    query_list = []
    for table in common.CLINICAL_DATA_TABLES:
        field_names = [
            'entry.' + field['name'] for field in resources.fields_for(table)
        ]
        fields = ', '.join(field_names)
        delete_query = SELECT_EXISTING_PERSON_IDS.format(project=project,
                                                         dataset=dataset,
                                                         table=table,
                                                         fields=fields)

        query_list.append({
            clean_consts.QUERY: delete_query,
            clean_consts.DESTINATION_TABLE: table,
            clean_consts.DESTINATION_DATASET: dataset,
            clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
        })

    return query_list
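# Illustration only (not part of the source module): the specs returned above
# are plain dicts keyed by the clean_consts constants, so a caller can inspect
# them before handing them to the cleaning engine. The project and dataset
# names are hypothetical.
def _example_inspect_specs():
    for spec in get_queries(project='example-project',
                            dataset='example_dataset'):
        print(spec[clean_consts.DESTINATION_TABLE],
              spec[clean_consts.DISPOSITION])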
def mapping_query(domain_table):
    """
    Return the query used to get the mapping of all records from RDR combined
    with EHR records of consented participants

    :param domain_table: one of the domain tables (e.g. 'visit_occurrence', 'condition_occurrence')
    :return: the mapping query for the domain table
    """
    if combine_consts.PERSON_ID in [
            field['name'] for field in resources.fields_for(domain_table)
    ]:
        return combine_consts.MAPPING_QUERY_WITH_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT,
            ehr_consent_table_id=combine_consts.EHR_CONSENT_TABLE_ID)
    else:
        return combine_consts.MAPPING_QUERY_WITHOUT_PERSON_CHECK.format(
            rdr_dataset_id=bq_utils.get_rdr_dataset_id(),
            ehr_dataset_id=bq_utils.get_dataset_id(),
            ehr_rdr_dataset_id=bq_utils.get_ehr_rdr_dataset_id(),
            domain_table=domain_table,
            mapping_constant=common.RDR_ID_CONSTANT)
def get_table_fields(table, ext_table_id):
    """
    Generates fields for ext tables for the provided cdm table

    :param table: cdm table to generate ext fields for
    :param ext_table_id: cdm table extension name.  used to load the schema
        definition if it exists as a json file
    :return: list of ext fields for the cdm table
    """
    table_fields = []

    try:
        table_fields = fields_for(ext_table_id)
        LOGGER.info(
            f"using json schema file definition for table: {ext_table_id}")
    except RuntimeError:
        for field in EXT_FIELD_TEMPLATE:
            table_field = dict()
            for key in field:
                table_field[key] = field[key].format(table=table)
            table_fields.append(table_field)
        LOGGER.info(
            f"using dynamic extension table schema for table: {ext_table_id}")

    return table_fields
def create_filter_list(self, filter_list):
    """
    Create a list of filter expressions.

    :param filter_list: a list of dictionaries representing the filter
        expressions to be used as part of an SQL WHERE clause

    :return: a list of strings representing the expressions.
    """
    field_definitions = {}
    for field_def in fields_for(self.tablename):
        field_definitions[field_def.get('name')] = field_def

    string_list = []
    for item in filter_list:
        string, field = create_on_string(item)

        field_definition = field_definitions.get(field)
        field_mode = field_definition.get('mode').lower()

        # if based on a nullable field, make sure to use the exists function
        if field_mode == 'nullable':
            item['qualifier'] = item['qualifier'].upper()
            string, _ = create_on_string(item)
            nullable_str = (
                ' exists (SELECT * FROM `:idataset.observation` AS record2 '
                'WHERE :join_tablename.observation_id = record2.observation_id '
                'AND {conditional})')
            string = nullable_str.format(conditional=string)

        string_list.append(string)

    return string_list
def get_table_schema(table_name, fields=None):
    """
    A helper function to create BigQuery SchemaFields from dictionary definitions.

    Given the table name, reads the schema from the schema definition file and
    returns a list of SchemaField objects that can be used for table creation.

    :param table_name:  the table name to get BigQuery SchemaField information for.
    :param fields: An optional argument to provide fields/schema as a list of JSON objects
    :returns:  a list of SchemaField objects representing the table's schema.
    """
    fields = fields if fields else fields_for(table_name)

    schema = []
    for column in fields:
        name = column.get('name')
        field_type = column.get('type')
        mode = column.get('mode')
        description = column.get('description')

        column_def = bigquery.SchemaField(name, field_type, mode, description)
        schema.append(column_def)

    return schema
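# Illustration only (not part of the source module): a minimal sketch of how
# the schema returned by get_table_schema might be used to create a table with
# the BigQuery client. The project, dataset, and table names are hypothetical.
from google.cloud import bigquery


def _example_create_person_table():
    client = bigquery.Client(project='example-project')
    schema = get_table_schema('person')
    table = bigquery.Table('example-project.example_dataset.person',
                           schema=schema)
    # exists_ok avoids an error if the table has already been created
    return client.create_table(table, exists_ok=True)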
def test_fields_for_duplicate_files(self, mock_walk):
    """
    Testing that fields_for works as expected with sub-directory structures.

    Verifies that if duplicates are detected and no distinction is made as to
    which one is wanted, an error is raised.  Also shows that if duplicate
    file names exist in separate directories, and the named sub-directory is
    searched and the file is found, this file is opened and read.
    """
    # preconditions
    sub_dir = 'baz'

    # mocks result tuples for os.walk
    walk_results = [
        (os.path.join('resource_files', 'schemas'), [sub_dir],
         ['duplicate.json', 'unique1.json']),
        (os.path.join('resource_files', 'schemas', sub_dir), [],
         ['duplicate.json', 'unique2.json'])
    ]

    mock_walk.return_value = walk_results

    # test
    self.assertRaises(RuntimeError, resources.fields_for, 'duplicate')

    # test
    data = '[{"id": "fake id desc", "type": "fake type"}]'
    json_data = json.loads(data)
    with mock.patch('resources.open',
                    mock.mock_open(read_data=data)) as mock_file:
        with mock.patch('resources.json.load') as mock_json:
            mock_json.return_value = json_data
            actual_fields = resources.fields_for('duplicate', sub_dir)

    self.assertEqual(actual_fields, json_data)
def test_validate_five_persons_success(self, mock_check_cron):
    expected_results = []
    test_file_names = [
        os.path.basename(f) for f in test_util.FIVE_PERSONS_FILES
    ]

    for cdm_file in common.SUBMISSION_FILES:
        if cdm_file in test_file_names:
            expected_result = (cdm_file, 1, 1, 1)
            test_file = os.path.join(test_util.FIVE_PERSONS_PATH, cdm_file)
            test_util.write_cloud_file(self.hpo_bucket,
                                       test_file,
                                       prefix=self.folder_prefix)
        else:
            expected_result = (cdm_file, 0, 0, 0)
        expected_results.append(expected_result)
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    folder_items = main.get_folder_items(bucket_items, self.folder_prefix)
    r = main.validate_submission(self.hpo_id, self.hpo_bucket, folder_items,
                                 self.folder_prefix)
    self.assertSetEqual(set(r['results']), set(expected_results))

    # check tables exist and are clustered as expected
    for table in resources.CDM_TABLES + common.PII_TABLES:
        table_id = bq_utils.get_table_id(test_util.FAKE_HPO_ID, table)
        table_info = bq_utils.get_table_info(table_id)
        fields = resources.fields_for(table)
        field_names = [field['name'] for field in fields]
        if 'person_id' in field_names:
            self.table_has_clustering(table_info)
def store_participant_data(df, project_id, destination_table):
    """
    Stores the fetched participant data in a BigQuery dataset. If the table
    doesn't exist, it will create that table. If the table does exist, it
    will be replaced with the new data.

    :param df: pandas dataframe created to hold participant data fetched from ParticipantSummary API
    :param project_id: identifies the project
    :param destination_table: name of the table to be written in the form of dataset.tablename
    :return: result of loading the participant data into the destination table
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            'Please specify the project in which to create the tables')

    table_schema = fields_for(destination_table.split('.')[-1])

    return pandas_gbq.to_gbq(df,
                             destination_table,
                             project_id,
                             if_exists="replace",
                             table_schema=table_schema)
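# Illustration only (not part of the source module): a minimal sketch of how
# store_participant_data might be called. The project id, destination table,
# and dataframe columns are hypothetical; real columns must match the JSON
# schema that fields_for loads for the destination table.
import pandas as pd


def _example_store():
    example_df = pd.DataFrame({
        'person_id': [1, 2],
        'consent_for_study_enrollment': ['SUBMITTED', 'SUBMITTED']
    })
    return store_participant_data(example_df, 'example-project',
                                  'example_dataset.ps_api_values')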
def parse_cdr_metadata_args(args=None):
    fields = resources.fields_for(METADATA_TABLE)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
        '--component',
        required=True,
        help='Job specification for adding data to metadata table',
        choices=list(JOB_COMPONENTS))
    parser.add_argument('--project_id',
                        required=True,
                        help='Identifies the project containing the datasets')
    parser.add_argument('--target_dataset',
                        default=True,
                        help='Identifies the dataset to copy metadata to')
    parser.add_argument('--source_dataset',
                        default=None,
                        help='Identifies the dataset to copy metadata from')

    for field in fields:
        parser.add_argument(f'--{field[NAME]}',
                            default=None,
                            help=f'{field[DESCRIPTION]}')

    cdr_metadata_args, unknown_args = parser.parse_known_args(args)
    custom_args = clean_cdr._get_kwargs(unknown_args)
    return cdr_metadata_args, custom_args
def get_table_cols_df(client, project_id, dataset_id):
    """
    Returns a df of the dataset's INFORMATION_SCHEMA.COLUMNS

    :param project_id: bq name of project_id
    :param dataset_id: bq name of dataset_id
    :param client: bq client object
    :return: dataframe of columns from INFORMATION_SCHEMA
    """
    table_cols_df = pd.DataFrame()
    if client:
        LOGGER.info(
            f"Getting column information from live dataset: `{dataset_id}`")
        # if possible, read live table schemas
        table_cols_query = TABLE_INFORMATION_SCHEMA.render(project=project_id,
                                                           dataset=dataset_id)
        table_cols_df = client.query(table_cols_query).to_dataframe()
    else:
        # if None is passed as the client, read the table data from JSON schemas
        # and generate a dataframe from the schema files
        LOGGER.info("Getting column information from schema files")
        table_dict_list = []
        for table in FITBIT_TABLES + CDM_TABLES:
            table_fields = resources.fields_for(table)
            for field in table_fields:
                field['table_name'] = table
            table_dict_list.extend(table_fields)

        table_cols_df = pd.DataFrame(table_dict_list)
        table_cols_df = table_cols_df.rename(columns={"name": "column_name"})

    return table_cols_df
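# Illustration only (not part of the source module): when no client is passed,
# get_table_cols_df falls back to the bundled JSON schema files, so it can be
# exercised without a BigQuery connection. The project and dataset names are
# hypothetical and unused in that branch.
def _example_offline_columns():
    offline_cols_df = get_table_cols_df(None, 'example-project',
                                        'example_dataset')
    print(offline_cols_df[['table_name', 'column_name']].head())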
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    This function gets the queries required to remove the duplicate id columns from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    tables_with_primary_key = cdm.tables_to_map()
    for table in tables_with_primary_key:
        table_name = table
        fields = resources.fields_for(table)
        # Generate column expressions for select
        col_exprs = [field['name'] for field in fields]
        cols = ', '.join(col_exprs)

        query = dict()
        query[cdr_consts.QUERY] = ID_DE_DUP_QUERY.format(
            columns=cols,
            project_id=project_id,
            dataset_id=dataset_id,
            domain_table=table,
            table_name=table_name)
        query[cdr_consts.DESTINATION_TABLE] = table
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
        query[cdr_consts.DESTINATION_DATASET] = dataset_id
        queries.append(query)
    return queries
def get_fill_freetext_source_value_fields_queries(project_id, dataset_id):
    """
    Generates queries to replace the source_value_fields with the concept_code.

    :param project_id: Name of the project containing the dataset on which the rules are applied
    :param dataset_id: Name of the dataset on which the rules are applied
    :return: A list of queries to be run.
    """
    queries_list = []
    for table in resources.CDM_TABLES:
        fields = [field['name'] for field in resources.fields_for(table)]
        fields_to_replace = get_fields_dict(table, fields)

        if fields_to_replace:
            cols = get_modified_columns(fields, fields_to_replace)

            full_join_expression = get_full_join_expression(
                dataset_id, project_id, fields_to_replace)

            query = dict()
            query[cdr_consts.QUERY] = FIELD_REPLACE_QUERY.format(
                columns=cols,
                table_name=table,
                dataset=dataset_id,
                project=project_id,
                join_expression=full_join_expression)
            query[cdr_consts.DESTINATION_TABLE] = table
            query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
            query[cdr_consts.DESTINATION_DATASET] = dataset_id
            queries_list.append(query)
    return queries_list
def _load_data(self):
    # Load measurement_concept_sets
    required_labs.load_measurement_concept_sets_table(
        project_id=self.project_id, dataset_id=self.dataset_id)
    # Load measurement_concept_sets_descendants
    required_labs.load_measurement_concept_sets_descendants_table(
        project_id=self.project_id, dataset_id=self.dataset_id)

    # Create concept and concept_ancestor empty tables if they do not exist
    if not bq_utils.table_exists(common.CONCEPT, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
    if not bq_utils.table_exists(common.CONCEPT_ANCESTOR, self.dataset_id):
        bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                       common.CONCEPT_ANCESTOR)

    # we need to load measurement.csv into bigquery_dataset_id in advance for
    # the other integration tests
    ehr_measurement_result = bq_utils.load_table_from_csv(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_name=bq_utils.get_table_id(FAKE_HPO_ID, common.MEASUREMENT),
        csv_path=test_util.FIVE_PERSONS_MEASUREMENT_CSV,
        fields=resources.fields_for(common.MEASUREMENT))
    bq_utils.wait_on_jobs([ehr_measurement_result['jobReference']['jobId']])
def get_id_deduplicate_queries(project_id, dataset_id):
    """
    This function gets the queries required to remove the duplicate id columns from a dataset

    :param project_id: Project name
    :param dataset_id: Name of the dataset where a rule should be applied
    :return: a list of queries.
    """
    queries = []
    tables_with_primary_key = cdm.tables_to_map()
    for table in tables_with_primary_key:
        if 'unioned' in dataset_id:
            table_name = 'unioned_ehr_{table}'.format(table=table)
        else:
            table_name = table
        if bq_utils.table_exists(table_name, dataset_id):
            fields = resources.fields_for(table)
            # Generate column expressions for select
            col_exprs = [field['name'] for field in fields]
            cols = ',\n '.join(col_exprs)
            query = ID_DE_DUP_QUERY.format(columns=cols,
                                           project_id=project_id,
                                           dataset_id=dataset_id,
                                           domain_table=table,
                                           table_name=table_name)
            queries.append(query)
    return queries
def test_measurement_concept_sets_table(self):

    query = sql_wrangle.qualify_tables(
        '''SELECT * FROM {dataset_id}.{table_id}'''.format(
            dataset_id=self.dataset_id,
            table_id=MEASUREMENT_CONCEPT_SETS_TABLE))
    response = bq_utils.query(query)

    actual_fields = [{
        'name': field['name'].lower(),
        'type': field['type'].lower()
    } for field in response['schema']['fields']]

    expected_fields = [{
        'name': field['name'].lower(),
        'type': field['type'].lower()
    } for field in resources.fields_for(MEASUREMENT_CONCEPT_SETS_TABLE)]

    self.assertListEqual(expected_fields, actual_fields)

    measurement_concept_sets_table_path = os.path.join(
        resources.resource_path, MEASUREMENT_CONCEPT_SETS_TABLE + '.csv')
    expected_total_rows = len(
        resources.csv_to_list(measurement_concept_sets_table_path))
    self.assertEqual(expected_total_rows, int(response['totalRows']))
def get_upgrade_table_query(client, dataset_id, table_id, hpo_id=None):
    """
    Generate a select query for the specified table, casting each column to
    the schema of the corresponding CDM table

    :param client: BQ Client
    :param dataset_id: Source dataset
    :param table_id: Source table
    :param hpo_id: identifies the HPO site, used to derive the base table name
    :return: the query string
    """
    try:
        source_table = f'{client.project}.{dataset_id}.{table_id}'
        source_fields = sq.get_source_fields(client, source_table)
        dst_fields = resources.fields_for(
            resources.get_base_table_name(table_id, hpo_id))
        col_cast_exprs = [
            get_field_cast_expr_with_schema_change(field, source_fields)
            for field in dst_fields
        ]
        col_expr = ', '.join(col_cast_exprs)
    except (OSError, IOError, RuntimeError):
        # default to select *
        col_expr = '*'
    select_all_query = 'SELECT {col_expr} FROM `{project_id}.{dataset_id}.{table_id}`'
    return select_all_query.format(col_expr=col_expr,
                                   project_id=client.project,
                                   dataset_id=dataset_id,
                                   table_id=table_id)
def get_non_match_participant_query(project_id, validation_dataset_id,
                                    identity_match_table):
    """
    This function generates the query for identifying non_match participants
    flagged by the DRC match algorithm

    :param project_id: identifies the project
    :param validation_dataset_id: identifies the dataset containing the identity match table
    :param identity_match_table: name of the identity match table
    :return: the query for selecting non-match participants
    """
    # if any two of first_name, last_name and birthday are missing, this is a non-match
    num_of_missing_key_fields = CRITERION_COLUMN_TEMPLATE.format(
        column_expr=get_missing_criterion(KEY_FIELDS),
        num_of_missing=NUM_OF_MISSING_KEY_FIELDS)

    identity_match_fields = [
        field['name']
        for field in resources.fields_for(IDENTITY_MATCH)
        if field['name'] not in IDENTITY_MATCH_EXCLUDED_FIELD
    ]
    # if the total number of missing fields is 4 or more, this is a non-match
    num_of_missing_all_fields = CRITERION_COLUMN_TEMPLATE.format(
        column_expr=get_missing_criterion(identity_match_fields),
        num_of_missing=NUM_OF_MISSING_ALL_FIELDS)

    # instantiate the query for identifying the non-match participants in the validation_dataset
    select_non_match_participants_query = SELECT_NON_MATCH_PARTICIPANTS_QUERY.format(
        project_id=project_id,
        validation_dataset_id=validation_dataset_id,
        identity_match_table=identity_match_table,
        key_fields_criteria=num_of_missing_key_fields,
        all_fields_criteria=num_of_missing_all_fields)

    return select_non_match_participants_query
def null_invalid_foreign_keys(project_id, dataset_id, sandbox_dataset_id=None):
    """
    This method gets the queries required to make invalid foreign keys null

    :param project_id: Project associated with the input and output datasets
    :param dataset_id: Dataset where cleaning rules are to be applied
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows
    #TODO use sandbox_dataset_id for CR
    :return: a list of queries
    """
    queries_list = []
    for table in resources.CDM_TABLES:
        field_names = [field['name'] for field in resources.fields_for(table)]
        foreign_keys_flags = []
        fields_to_join = []

        for field_name in field_names:
            if field_name in FOREIGN_KEYS_FIELDS and field_name != table + '_id':
                fields_to_join.append(field_name)
                foreign_keys_flags.append(field_name)

        if fields_to_join:
            col_exprs = []
            for field in field_names:
                if field in fields_to_join:
                    if field in foreign_keys_flags:
                        col_expr = '{x}.'.format(x=field[:3]) + field
                else:
                    col_expr = field
                col_exprs.append(col_expr)
            cols = ', '.join(col_exprs)

            join_expression = []
            for key in FOREIGN_KEYS_FIELDS:
                if key in foreign_keys_flags:
                    if key == 'person_id':
                        table_alias = cdr_consts.PERSON_TABLE_NAME
                    else:
                        table_alias = _mapping_table_for(
                            '{x}'.format(x=key)[:-3])
                    join_expression.append(
                        LEFT_JOIN.format(dataset_id=dataset_id,
                                         prefix=key[:3],
                                         field=key,
                                         table=table_alias))

            full_join_expression = " ".join(join_expression)
            query = dict()
            query[cdr_consts.QUERY] = INVALID_FOREIGN_KEY_QUERY.format(
                cols=cols,
                table_name=table,
                dataset_id=dataset_id,
                project=project_id,
                join_expr=full_join_expression)
            query[cdr_consts.DESTINATION_TABLE] = table
            query[cdr_consts.DISPOSITION] = bq_consts.WRITE_TRUNCATE
            query[cdr_consts.DESTINATION_DATASET] = dataset_id
            queries_list.append(query)
    return queries_list
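# Illustration only (not part of the source module): the loop above aliases
# each foreign key by the first three letters of the column name and, for keys
# other than person_id, joins against the key's mapping table, whose base name
# is derived by stripping the trailing '_id'. The keys below are hypothetical
# examples of what FOREIGN_KEYS_FIELDS might contain.
_example_keys = ['person_id', 'visit_occurrence_id', 'provider_id']
_example_aliases = {key: key[:3] for key in _example_keys}
_example_mapped = {key: key[:-3] for key in _example_keys if key != 'person_id'}
# _example_aliases == {'person_id': 'per', 'visit_occurrence_id': 'vis',
#                      'provider_id': 'pro'}
# _example_mapped == {'visit_occurrence_id': 'visit_occurrence',
#                     'provider_id': 'provider'}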
def test_get_query_spec(self):
    # Pre conditions
    self.assertEqual(self.rule_instance.affected_datasets,
                     [cdr_consts.COMBINED])

    # Test
    result_list = self.rule_instance.get_query_specs()

    # Post conditions
    expected_query_list = []
    expected_sandbox_query_list = []

    for table in table_dates:
        sandbox_query = dict()
        sandbox_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
            project_id=self.project_id,
            sandbox_id=self.sandbox_dataset_id,
            intermediary_table=self.rule_instance.sandbox_table_for(table),
            dataset_id=self.dataset_id,
            table=table,
            table_end_date=table_dates[table][1],
            table_start_date=table_dates[table][0])
        expected_sandbox_query_list.append(sandbox_query)

        fields = resources.fields_for(table)
        col_exprs = [
            'r.' + field['name']
            if field['name'] == table_dates[table][1] else 'l.' + field['name']
            for field in fields
        ]
        cols = ', '.join(col_exprs)

        query = dict()
        query[cdr_consts.QUERY] = NULL_BAD_END_DATES.render(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            cols=cols,
            table=table,
            table_start_date=table_dates[table][0],
            table_end_date=table_dates[table][1])
        expected_query_list.append(query)

    sandbox_query = dict()
    sandbox_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
        project_id=self.project_id,
        sandbox_id=self.sandbox_dataset_id,
        intermediary_table=self.rule_instance.sandbox_table_for(
            visit_occurrence),
        dataset_id=self.dataset_id,
        table=visit_occurrence,
        table_end_date=visit_occurrence_dates[visit_occurrence][1],
        table_start_date=visit_occurrence_dates[visit_occurrence][0])
    expected_sandbox_query_list.append(sandbox_query)

    query = dict()
    query[cdr_consts.QUERY] = POPULATE_VISIT_END_DATES.render(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        placeholder_date=placeholder_date)
    expected_query_list.append(query)

    expected_list = expected_sandbox_query_list + expected_query_list

    self.assertEqual(result_list, expected_list)
def main(raw_args=None):
    """
    Execute deid as a single script.

    Responsible for aggregating the tables deid will execute on and calling deid.
    """
    args = parse_args(raw_args)
    add_console_logging(args.console_log)
    known_tables = get_known_tables(fields_path)
    deid_tables_path = os.path.join(DEID_PATH, 'config', 'ids', 'tables')
    configured_tables = get_known_tables(deid_tables_path)
    tables = get_output_tables(args.input_dataset, known_tables,
                               args.skip_tables, args.tables)

    exceptions = []
    successes = []
    for table in tables:
        tablepath = None
        if table in configured_tables:
            tablepath = os.path.join(deid_tables_path, table + '.json')
        else:
            tablepath = table

        parameter_list = [
            '--rules',
            os.path.join(DEID_PATH, 'config', 'ids', 'config.json'),
            '--private_key', args.private_key, '--table', tablepath,
            '--action', args.action, '--idataset', args.input_dataset,
            '--log', LOGS_PATH
        ]

        if args.interactive_mode:
            parameter_list.append('--interactive')

        field_names = [field.get('name') for field in fields_for(table)]
        if 'person_id' in field_names:
            parameter_list.append('--cluster')

        LOGGER.info('Executing deid with:\n\tpython deid/aou.py %s',
                    ' '.join(parameter_list))

        try:
            aou.main(parameter_list)
        except google.api_core.exceptions.GoogleAPIError:
            LOGGER.exception("Encountered deid exception:\n")
            exceptions.append(table)
        else:
            LOGGER.info('Successfully executed deid on table: %s', table)
            successes.append(table)

    copy_suppressed_table_schemas(known_tables, args.input_dataset + '_deid')

    LOGGER.info('Deid has finished. Successfully executed on tables: %s',
                '\n'.join(successes))
    for exc in exceptions:
        LOGGER.error(
            "Deid encountered exceptions when processing table: %s"
            ". Fix problems and re-run deid for table if needed.", exc)
def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
    """
    Return a list of dictionary query specifications.

    :return:  A list of dictionaries. Each dictionary contains a single query
        and a specification for how to execute that query. The specifications
        are optional but the query is required.
    """
    queries = []
    sandbox_queries = []
    for table in table_dates:
        # create sandbox queries
        sandbox_query = dict()
        sandbox_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
            project_id=self.project_id,
            sandbox_id=self.sandbox_dataset_id,
            intermediary_table=self.sandbox_table_for(table),
            dataset_id=self.dataset_id,
            table=table,
            table_end_date=table_dates[table][1],
            table_start_date=table_dates[table][0])
        sandbox_queries.append(sandbox_query)

        fields = resources.fields_for(table)
        # Generate column expressions for select
        col_exprs = [
            'r.' + field['name']
            if field['name'] == table_dates[table][1] else 'l.' + field['name']
            for field in fields
        ]
        cols = ', '.join(col_exprs)

        query = dict()
        query[cdr_consts.QUERY] = NULL_BAD_END_DATES.render(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            cols=cols,
            table=table,
            table_start_date=table_dates[table][0],
            table_end_date=table_dates[table][1])
        queries.append(query)

    # Sandbox query for visit_occurrence
    sandbox_vo_query = dict()
    sandbox_vo_query[cdr_consts.QUERY] = SANDBOX_BAD_END_DATES.render(
        project_id=self.project_id,
        sandbox_id=self.sandbox_dataset_id,
        intermediary_table=self.sandbox_table_for(visit_occurrence),
        dataset_id=self.dataset_id,
        table=visit_occurrence,
        table_end_date=visit_occurrence_dates[visit_occurrence][1],
        table_start_date=visit_occurrence_dates[visit_occurrence][0])
    sandbox_queries.append(sandbox_vo_query)

    vo_query = dict()
    vo_query[cdr_consts.QUERY] = POPULATE_VISIT_END_DATES.render(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        placeholder_date=placeholder_date)
    queries.append(vo_query)

    return sandbox_queries + queries
def _get_field_names(tablename):
    try:
        fields = fields_for(tablename)
    except RuntimeError:
        additional_fields = []
    else:
        additional_fields = [f.get('name') for f in fields]

    return additional_fields
def get_table_count_query(dataset_id, table_ids, where):
    queries = []
    for table_id in table_ids:
        if table_id == '_ehr_consent' or 'person_id' in [
                field['name'] for field in resources.fields_for(table_id)
        ]:
            queries.append(table_count_query(dataset_id, table_id, where))
        else:
            queries.append(table_count_query(dataset_id, table_id, where=''))
    return queries
def get_field_names(self, table):
    """
    This method gets the list of field names in a table affected by the cleaning rule

    :param table: single table in the list of affected tables
    :return: list of field names in a single affected table
    """
    field_names = [field['name'] for field in resources.fields_for(table)]
    return field_names