示例#1
0
def clean_ehr_dataset(project=None, dataset=None):
    """
    Build the EHR cleaning queries and run them against a dataset.

    :param project: project identifier; forwarded unchanged to the query
        builder and to the cleaning engine (no default is substituted here
        when it is None).
    :param dataset: dataset identifier; when it is None, empty, or made up
        only of whitespace, the value returned by
        ``bq_utils.get_dataset_id()`` is used instead.
    """
    dataset_is_blank = (dataset is None or dataset == '' or
                        dataset.isspace())
    if dataset_is_blank:
        # Fall back to the default dataset and record which one was chosen.
        dataset = bq_utils.get_dataset_id()
        LOGGER.info('Dataset is unspecified.  Using default value of:\t%s',
                    dataset)

    # Queries are gathered before the "Cleaning" message is logged, so a
    # failure while building them surfaces before that log line.
    ehr_queries = _gather_ehr_queries(project, dataset)

    LOGGER.info("Cleaning ehr_dataset")
    clean_engine.clean_dataset(project, dataset, ehr_queries)
示例#2
0
            # Generate column expressions for select
            col_exprs = [field['name'] for field in fields]
            cols = ',\n        '.join(col_exprs)
            query = ID_DE_DUP_QUERY.format(columns=cols,
                                           project_id=project_id,
                                           dataset_id=dataset_id,
                                           domain_table=table,
                                           table_name=table_name)
            queries.append(query)
    return queries


if __name__ == '__main__':
    import argparse
    import clean_cdr_engine

    # Both arguments are positional, so argparse rejects an invocation that
    # omits either of them.
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Parse project_id and dataset_id')
    arg_parser.add_argument(
        'project_id',
        help='Project associated with the input and output datasets')
    arg_parser.add_argument(
        'dataset_id',
        help='Dataset where cleaning rules are to be applied')
    cli_args = arg_parser.parse_args()

    # An empty-string dataset_id is falsy; in that case nothing is executed.
    if cli_args.dataset_id:
        dedup_queries = get_id_deduplicate_queries(cli_args.project_id,
                                                   cli_args.dataset_id)
        clean_cdr_engine.clean_dataset(cli_args.project_id,
                                       cli_args.dataset_id, dedup_queries)