def retraction_query_runner(queries):
    query_job_ids = []
    for query_dict in queries:
        logger.debug('Retracting from %s.%s using query %s' %
                     (query_dict[DEST_DATASET], query_dict[DEST_TABLE],
                      query_dict[QUERY]))
        if query_dict[DELETE_FLAG]:
            job_results = bq_utils.query(q=query_dict[QUERY], batch=True)
            rows_affected = job_results['numDmlAffectedRows']
            logger.debug('%s rows deleted from %s.%s' %
                         (rows_affected, query_dict[DEST_DATASET],
                          query_dict[DEST_TABLE]))
        else:
            job_results = bq_utils.query(
                q=query_dict[QUERY],
                destination_table_id=query_dict[DEST_TABLE],
                write_disposition=WRITE_TRUNCATE,
                destination_dataset_id=query_dict[DEST_DATASET],
                batch=True)
        query_job_id = job_results['jobReference']['jobId']
        query_job_ids.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids)
    if incomplete_jobs:
        logger.debug('Failed on {count} job ids {ids}'.format(
            count=len(incomplete_jobs), ids=incomplete_jobs))
        logger.debug('Terminating retraction')
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
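For reference, `queries` here is a list of dicts keyed by the module-level constants DEST_DATASET, DEST_TABLE, QUERY, and DELETE_FLAG. A minimal sketch of a call, assuming those constants are in scope; the dataset, table, and SQL below are placeholders only:

# Hypothetical input for retraction_query_runner; dataset/table names and SQL
# are placeholders, and the dict keys are the module-level constants used above.
retraction_queries = [{
    DEST_DATASET: 'combined_dataset',
    DEST_TABLE: 'observation',
    QUERY: ('DELETE FROM `my-project.combined_dataset.observation` '
            'WHERE person_id IN (1, 2, 3)'),
    DELETE_FLAG: True,  # run as a DML delete and log numDmlAffectedRows
}]
retraction_query_runner(retraction_queries)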
Example #2
def create_match_values_table(project, rdr_dataset, destination_dataset):
    """
    Get the desired matching values from the combined observation table.

    This retrieves all possible matching values from the observation table
    used in participant matching.  Returned query data is limited to the
    person_id, observation_concept_id, and the value_as_string fields.

    :param project:  The project name to query
    :param rdr_dataset:  The rdr dataset name to query
    :param destination_dataset:  The dataset to write the result table to

    :return: The name of the interim table
    """
    query_string = consts.ALL_PPI_OBSERVATION_VALUES.format(
        project=project,
        dataset=rdr_dataset,
        table=consts.OBSERVATION_TABLE,
        pii_list=','.join(consts.PII_CODES_LIST))

    LOGGER.info("Participant validation ran the query\n%s", query_string)
    results = bq_utils.query(query_string,
                             destination_dataset_id=destination_dataset,
                             destination_table_id=consts.ID_MATCH_TABLE,
                             write_disposition='WRITE_TRUNCATE',
                             batch=True)

    query_job_id = results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if incomplete_jobs != []:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

    return consts.ID_MATCH_TABLE
Example #3
def query(q, dst_table_id, dst_dataset_id, write_disposition='WRITE_APPEND'):
    """
    Run query and save results to a table

    :param q: SQL statement
    :param dst_table_id: save results in a table with the specified id
    :param dst_dataset_id: identifies output dataset
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema)
    :return: query result
    """
    query_job_result = bq_utils.query(q,
                                      destination_table_id=dst_table_id,
                                      destination_dataset_id=dst_dataset_id,
                                      write_disposition=write_disposition)
    query_job_id = query_job_result['jobReference']['jobId']
    logging.info(f'Job {query_job_id} started for table {dst_table_id}')
    job_status = query_job_result['status']
    error_result = job_status.get('errorResult')
    if error_result is not None:
        msg = f'Job {query_job_id} failed because: {error_result}'
        raise bq_utils.InvalidOperationError(msg)
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    return query_job_result
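A minimal usage sketch of this wrapper; the project, dataset, and table ids are placeholders:

# Hypothetical call to the wrapper above; ids are placeholders only.
result = query(
    'SELECT person_id FROM `my-project.my_dataset.person`',
    dst_table_id='person_ids',
    dst_dataset_id='scratch_dataset',
    write_disposition='WRITE_TRUNCATE')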
Example #4
def query(q, dst_table_id, write_disposition='WRITE_APPEND'):
    """
    Run query and block until job is done
    :param q: SQL statement
    :param dst_table_id: if set, output is saved in a table with the specified id
    :param write_disposition: WRITE_TRUNCATE, WRITE_EMPTY, or WRITE_APPEND (default, to preserve schema)
    """
    dst_dataset_id = bq_utils.get_ehr_rdr_dataset_id()
    query_job_result = bq_utils.query(q, destination_table_id=dst_table_id, write_disposition=write_disposition,
                                      destination_dataset_id=dst_dataset_id)
    query_job_id = query_job_result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if len(incomplete_jobs) > 0:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
Example #5
def clean_dataset(project=None, dataset=None, statements=None):
    if project is None or project == '' or project.isspace():
        project = app_identity.get_application_id()
        LOGGER.debug('Project name not provided.  Using default.')

    if statements is None:
        statements = []

    failures = 0
    successes = 0
    for statement in statements:
        rule_query = statement.format(project=project, dataset=dataset)

        try:
            LOGGER.info("Running query %s", rule_query)
            results = bq_utils.query(rule_query)
        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception("FAILED:  Clean rule not executed:\n%s",
                             rule_query)
            failures += 1
            continue

        LOGGER.info("Executing query %s", rule_query)

        # wait for job to finish
        query_job_id = results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
        if incomplete_jobs != []:
            failures += 1
            raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

        successes += 1

    if successes > 0:
        LOGGER.info("Successfully applied %d clean rules for %s.%s", successes,
                    project, dataset)
    else:
        LOGGER.warning("No clean rules successfully applied to %s.%s", project,
                       dataset)

    if failures > 0:
        print("Failed to apply {} clean rules for {}.{}".format(
            failures, project, dataset))
        LOGGER.warning("Failed to apply %d clean rules for %s.%s", failures,
                       project, dataset)
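In this variant each statement is a plain SQL template; clean_dataset fills in {project} and {dataset} before running it. A minimal sketch of a call, with a placeholder clean rule:

# Hypothetical clean rule template; the table name and condition are placeholders.
statements = [
    'DELETE FROM `{project}.{dataset}.observation` '
    'WHERE observation_source_value IS NULL',
]
clean_dataset(project='my-project', dataset='my_dataset', statements=statements)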
Example #6
def get_list_non_match_participants(client, project_id, validation_dataset_id,
                                    hpo_id):
    """
    This function retrieves a list of non-match participants

    :param client: a BigQuery client object
    :param project_id: identifies the project containing the validation dataset
    :param validation_dataset_id: identifies the dataset containing the identity match table
    :param hpo_id: identifies the HPO site
    :return: a list of person_ids for participants that did not match
    """

    # get the hpo-specific <hpo_id>_identity_match table
    identity_match_table = bq_utils.get_table_id(hpo_id, IDENTITY_MATCH)
    result = []
    fq_identity_match_table = f'{project_id}.{validation_dataset_id}.{identity_match_table}'
    if not exist_identity_match(client, fq_identity_match_table):
        return result

    non_match_participants_query = get_non_match_participant_query(
        project_id, validation_dataset_id, identity_match_table)

    try:
        LOGGER.info(
            'Identifying non-match participants in {dataset_id}.{identity_match_table}'
            .format(dataset_id=validation_dataset_id,
                    identity_match_table=identity_match_table))

        results = bq_utils.query(q=non_match_participants_query)

    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError) as exp:

        LOGGER.exception('Could not execute the query \n{query}'.format(
            query=non_match_participants_query))
        raise exp

    # wait for job to finish
    query_job_id = results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
    if incomplete_jobs:
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

    # return the person_ids only
    result = [row[PERSON_ID_FIELD] for row in bq_utils.response2rows(results)]
    return result
Example #7
def create_drug_route_mappings_table(project_id, route_mapping_dataset_id,
                                     dose_form_routes_table_id,
                                     route_mapping_prefix):
    """
    Creates "drug_route_mappings" table using the query GET_DRUGS_FROM_DOSE_FORM
    
    :param project_id: the project containing the routes dataset
    :param route_mapping_dataset_id: dataset where the dose_form_route mapping table exists
            and where the drug_route mapping table will be created
    :param dose_form_routes_table_id: table_id of the dose_form_routes mapping table
    :param route_mapping_prefix: prefix for the dose_form_routes_mapping_table
    :return: upload metadata and created drug_route_table_id
    """
    if route_mapping_dataset_id is None:
        # Using table created in bq_dataset instead of re-creating in every dataset
        route_mapping_dataset_id = bq_utils.get_dataset_id()

    LOGGER.info("Creating %s.%s", route_mapping_dataset_id,
                DRUG_ROUTES_TABLE_ID)

    # create empty table
    bq_utils.create_table(DRUG_ROUTES_TABLE_ID,
                          DRUG_ROUTE_FIELDS,
                          drop_existing=True,
                          dataset_id=route_mapping_dataset_id)

    drug_routes_populate_query = GET_DRUGS_FROM_DOSE_FORM.format(
        project_id=project_id,
        vocabulary_dataset=common.VOCABULARY_DATASET,
        route_mapping_dataset_id=route_mapping_dataset_id,
        dose_form_route_mapping_table=dose_form_routes_table_id,
        route_mapping_prefix=route_mapping_prefix)
    result = bq_utils.query(q=drug_routes_populate_query,
                            write_disposition='WRITE_TRUNCATE',
                            destination_dataset_id=route_mapping_dataset_id,
                            destination_table_id=DRUG_ROUTES_TABLE_ID,
                            batch=True)
    incomplete_jobs = bq_utils.wait_on_jobs([result['jobReference']['jobId']])
    if incomplete_jobs:
        LOGGER.info('Failed job id {id}'.format(id=incomplete_jobs[0]))
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    LOGGER.info("Created %s.%s", route_mapping_dataset_id,
                DRUG_ROUTES_TABLE_ID)
    return result
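A minimal sketch of a call to this function; the project, dataset, table ids, and prefix below are placeholders:

# Hypothetical call; all ids and the prefix are placeholders only.
result = create_drug_route_mappings_table(
    project_id='my-project',
    route_mapping_dataset_id='lookup_dataset',
    dose_form_routes_table_id='dose_form_route_mappings',
    route_mapping_prefix='dose_form_routes')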
Example #8
def retraction_query_runner(queries):
    query_job_ids = []
    for query_dict in queries:
        logging.info('Retracting from %s.%s using query %s' %
                     (query_dict[DEST_DATASET], query_dict[DEST_TABLE],
                      query_dict[QUERY]))
        job_results = bq_utils.query(q=query_dict[QUERY], batch=True)
        rows_affected = job_results['numDmlAffectedRows']
        logging.info(
            '%s rows deleted from %s.%s' %
            (rows_affected, query_dict[DEST_DATASET], query_dict[DEST_TABLE]))
        query_job_id = job_results['jobReference']['jobId']
        query_job_ids.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids)
    if incomplete_jobs:
        logging.info('Failed on {count} job ids {ids}'.format(
            count=len(incomplete_jobs), ids=incomplete_jobs))
        logging.info('Terminating retraction')
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
Example #9
def retraction_query_runner(queries):
    query_job_ids = []
    for query_dict in queries:
        LOGGER.info(
            f'Retracting from {query_dict[DEST_DATASET]}.{query_dict[DEST_TABLE]} '
            f'using query {query_dict[QUERY]}')
        job_results = bq_utils.query(q=query_dict[QUERY], batch=True)
        rows_affected = job_results['numDmlAffectedRows']
        LOGGER.info(
            f'{rows_affected} rows deleted from {query_dict[DEST_DATASET]}.{query_dict[DEST_TABLE]}'
        )
        query_job_id = job_results['jobReference']['jobId']
        query_job_ids.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids)
    if incomplete_jobs:
        LOGGER.info(
            f'Failed on {len(incomplete_jobs)} job ids {incomplete_jobs}')
        LOGGER.info('Terminating retraction')
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
Example #10
def run_queries(queries):
    """
    Run the queries that create the '_deid_map' tables

    :param queries: list of queries
    """
    query_job_ids = []
    for query in queries:
        logging.info(
            f'Creating or renaming _deid_map table with query: {query}')
        job_results = bq_utils.query(q=query, batch=True)
        logging.info('_deid_map table created.')
        query_job_id = job_results['jobReference']['jobId']
        query_job_ids.append(query_job_id)

    incomplete_jobs = bq_utils.wait_on_jobs(query_job_ids)
    if incomplete_jobs:
        logging.info(
            f'Failed on {len(incomplete_jobs)} job ids {incomplete_jobs}')
        logging.info('Terminating _deid_map creation')
        raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
Example #11
def write_to_result_table(project, dataset, site, match_values):
    """
    Append items in match_values to the table generated from site name.

    Attempts to limit the insert query to less than 1MB.

    :param project:  the BigQuery project name
    :param dataset:  name of the dataset containing the table to append to
    :param site:  string identifier for the hpo site
    :param match_values:  dictionary of person_ids and match values for a field

    :return: query results value
    :raises:  oauth2client.client.HttpAccessTokenRefreshError,
              googleapiclient.errors.HttpError
    """
    if not match_values:
        LOGGER.info(f"No values to insert for site: {site}")
        return None

    result_table = site + consts.VALIDATION_TABLE_SUFFIX
    bucket = gcs_utils.get_drc_bucket()
    path = dataset + '/intermediate_results/' + site + '.csv'

    field_list = [consts.PERSON_ID_FIELD]
    field_list.extend(consts.VALIDATION_FIELDS)
    field_list.append(consts.ALGORITHM_FIELD)

    results = StringIO()
    field_list_str = ','.join(field_list) + '\n'
    results.write(field_list_str)

    LOGGER.info(f"Generating csv values to write to storage for site: {site}")

    for person_key, person_values in match_values.items():
        str_list = [str(person_key)]
        for field in consts.VALIDATION_FIELDS:
            value = str(person_values.get(field, consts.MISSING))
            str_list.append(value)

        str_list.append(consts.YES)
        val_str = ','.join(str_list)
        results.write(val_str + '\n')

    LOGGER.info(f"Writing csv file to cloud storage for site: {site}")

    # write results
    results.seek(0)
    gcs_utils.upload_object(bucket, path, results)
    results.close()

    LOGGER.info(
        f"Wrote {len(match_values)} items to cloud storage for site: {site}")

    # wait on results to be written

    schema_path = os.path.join(fields_path, 'identity_match.json')

    LOGGER.info(
        f"Beginning load of identity match values from csv into BigQuery "
        f"for site: {site}")
    try:
        # load csv file into bigquery
        results = bq_utils.load_csv(schema_path,
                                    'gs://' + bucket + '/' + path,
                                    project,
                                    dataset,
                                    result_table,
                                    write_disposition=consts.WRITE_TRUNCATE)

        # ensure the load job finishes
        query_job_id = results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
        if incomplete_jobs != []:
            raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(
            f"Encountered an exception when loading records from csv for site: {site}"
        )
        raise

    LOGGER.info(f"Loaded match values for site: {site}")

    return results
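Here `match_values` maps each person_id to a dict of field names and match values; the real field names come from consts.VALIDATION_FIELDS. A minimal sketch with placeholder fields and values:

# Hypothetical input; field names and match values are placeholders, and any
# field missing from a person's dict is written as consts.MISSING.
match_values = {
    123: {'first_name': 'match', 'last_name': 'no_match'},
    456: {'first_name': 'match'},
}
write_to_result_table('my-project', 'validation_dataset', 'hpo_site_x', match_values)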
Example #12
def clean_dataset(project=None, statements=None, data_stage=stage.UNSPECIFIED):
    """
       Run the assigned cleaning rules.

       :param project:  the project name
       :param statements:  a list of dictionary objects to run the query
       :param data_stage:  an enum to indicate what stage of the cleaning this is
       """
    if project is None or project == '' or project.isspace():
        project = app_identity.get_application_id()
        LOGGER.info('Project name not provided.  Using default.')

    if statements is None:
        statements = []

    failures = 0
    successes = 0
    for statement in statements:
        rule_query = statement.get(cdr_consts.QUERY, '')
        legacy_sql = statement.get(cdr_consts.LEGACY_SQL, False)
        destination_table = statement.get(cdr_consts.DESTINATION_TABLE, None)
        retry = statement.get(cdr_consts.RETRY_COUNT,
                              bq_consts.BQ_DEFAULT_RETRY_COUNT)
        disposition = statement.get(cdr_consts.DISPOSITION,
                                    bq_consts.WRITE_EMPTY)
        destination_dataset = statement.get(cdr_consts.DESTINATION_DATASET,
                                            None)
        batch = statement.get(cdr_consts.BATCH, None)

        try:
            LOGGER.info("Running query %s", rule_query)
            results = bq_utils.query(
                rule_query,
                use_legacy_sql=legacy_sql,
                destination_table_id=destination_table,
                retry_count=retry,
                write_disposition=disposition,
                destination_dataset_id=destination_dataset,
                batch=batch)

        except (oauth2client.client.HttpAccessTokenRefreshError,
                googleapiclient.errors.HttpError):
            LOGGER.exception("FAILED:  Clean rule not executed:\n%s",
                             rule_query)
            failures += 1
            continue

        LOGGER.info("Executing query %s", rule_query)

        # wait for job to finish
        query_job_id = results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
        if incomplete_jobs != []:
            failures += 1
            raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

        if destination_table is not None:
            updated_rows = results.get("totalRows")
            if updated_rows is not None:
                LOGGER.info("Query returned %d rows for %s.%s", updated_rows,
                            destination_dataset, destination_table)

        successes += 1

    if successes > 0:
        LOGGER.info("Successfully applied %d clean rules for %s.%s", successes,
                    project, data_stage)
    else:
        LOGGER.warning("No clean rules successfully applied to %s.%s", project,
                       data_stage)

    if failures > 0:
        print("Failed to apply {} clean rules for {}.{}".format(
            failures, project, data_stage))
        LOGGER.warning("Failed to apply %d clean rules for %s.%s", failures,
                       project, data_stage)
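In this variant each statement is a dict keyed by cdr_consts constants; keys that are left out fall back to the defaults read above. A minimal sketch with a placeholder rule:

# Hypothetical clean rule; the query text and table/dataset ids are placeholders,
# and the keys are the cdr_consts constants read by clean_dataset above.
statements = [{
    cdr_consts.QUERY: 'SELECT * FROM `my-project.my_dataset.person` WHERE person_id > 0',
    cdr_consts.DESTINATION_TABLE: 'person',
    cdr_consts.DISPOSITION: 'WRITE_TRUNCATE',
    cdr_consts.DESTINATION_DATASET: 'my_dataset_clean',
    cdr_consts.BATCH: True,
}]
clean_dataset(project='my-project', statements=statements, data_stage=stage.UNSPECIFIED)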