def clean_ehr_dataset(project=None, dataset=None):
    """Apply the EHR cleaning rules to a dataset.

    Gathers the EHR query list and hands it to the cleaning engine.

    :param project: identifier of the project to run against; forwarded
        unchanged to the cleaning engine (may be None)
    :param dataset: identifier of the dataset to clean; when None, empty,
        or whitespace-only, the configured default dataset is used instead
    """
    # Treat None / '' / all-whitespace as "no dataset given" and fall back
    # to the configured default.
    dataset_missing = dataset is None or not dataset.strip()
    if dataset_missing:
        dataset = bq_utils.get_dataset_id()
        LOGGER.info('Dataset is unspecified. Using default value of:\t%s',
                    dataset)

    query_list = _gather_ehr_queries(project, dataset)

    LOGGER.info("Cleaning ehr_dataset")
    clean_engine.clean_dataset(project, dataset, query_list)
# Generate column expressions for select col_exprs = [field['name'] for field in fields] cols = ',\n '.join(col_exprs) query = ID_DE_DUP_QUERY.format(columns=cols, project_id=project_id, dataset_id=dataset_id, domain_table=table, table_name=table_name) queries.append(query) return queries if __name__ == '__main__': import argparse import clean_cdr_engine parser = argparse.ArgumentParser( description='Parse project_id and dataset_id', formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( 'project_id', help='Project associated with the input and output datasets') parser.add_argument('dataset_id', help='Dataset where cleaning rules are to be applied') args = parser.parse_args() if args.dataset_id: query_list = get_id_deduplicate_queries(args.project_id, args.dataset_id) clean_cdr_engine.clean_dataset(args.project_id, args.dataset_id, query_list)