Пример #1
0
def parse_reroute_domain_query(project_id, dataset_id, dest_table):
    """
    This function generates a query that reroutes the records from all domain tables for the given dest_table.
    It uses _mapping_alignment_table to determine in which domain table the records should land.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :param dest_table: the destination CDM table for rerouting
    :return: a query that reroutes the records from all domain tables for the given dest_table
    """
    union_query = EMPTY_STRING

    for src_table in domain_mapping.DOMAIN_TABLE_NAMES:
        if src_table == dest_table or domain_mapping.exist_domain_mappings(src_table, dest_table):

            src_domain_id_field = get_domain_id_field(src_table)
            dest_domain_id_field = get_domain_id_field(dest_table)
            field_mapping_expr = resolve_field_mappings(src_table, dest_table)

            if union_query != EMPTY_STRING:
                union_query += UNION_ALL

            union_query += REROUTE_DOMAIN_RECORD_QUERY.format(project_id=project_id,
                                                              dataset_id=dataset_id,
                                                              src_table=src_table,
                                                              dest_table=dest_table,
                                                              src_domain_id_field=src_domain_id_field,
                                                              dest_domain_id_field=dest_domain_id_field,
                                                              field_mapping_expr=field_mapping_expr)
    return union_query
Пример #2
0
def parse_domain_mapping_query_cross_domain(project_id, dataset_id,
                                            dest_table):
    """
    This function creates a query that generates id mappings in _logging_domain_alignment
    for the rerouting records for dest_table

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :param dest_table: the destination table to which the records are rerouted
    :return: the query that generates id mappings for the rerouting records
    """
    union_query = EMPTY_STRING

    domain = resources.get_domain(dest_table)
    dest_id_field = resources.get_domain_id_field(dest_table)

    for src_table in domain_mapping.DOMAIN_TABLE_NAMES:

        if src_table != dest_table and domain_mapping.exist_domain_mappings(
                src_table, dest_table):

            src_id_field = resources.get_domain_id_field(src_table)
            domain_concept_id = resources.get_domain_concept_id(src_table)

            if union_query != EMPTY_STRING:
                union_query += UNION_ALL

            union_query += DOMAIN_REROUTE_INCLUDED_INNER_QUERY.render(
                project_id=project_id,
                dataset_id=dataset_id,
                src_table=src_table,
                dest_table=dest_table,
                src_id=src_id_field,
                dest_id=NULL_VALUE,
                domain_concept_id=domain_concept_id,
                domain='\'{}\''.format(domain))

            criteria = domain_mapping.get_rerouting_criteria(
                src_table, dest_table)

            if criteria != EMPTY_STRING:
                union_query += AND + criteria

    output_query = EMPTY_STRING

    if union_query != EMPTY_STRING:
        # the query to get the max id for the dest table
        domain_query = MAXIMUM_DOMAIN_ID_QUERY.render(
            project_id=project_id,
            dataset_id=dataset_id,
            domain_table=dest_table,
            domain_id_field=dest_id_field)

        output_query = DOMAIN_MAPPING_OUTER_QUERY.render(
            union_query=union_query, domain_query=domain_query)
    return output_query
Пример #3
0
def parse_domain_mapping_query_for_excluded_records(project_id, dataset_id):
    """
    This function generates a query that generates id mappings in _logging_domain_alignment for the records
    that will get dropped during rerouting because those records either fail the rerouting criteria or rerouting
    is not possible between src_table and dest_table such as condition_occurrence -> measurement

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a query that generates id mappings for the records that will get dropped
    """
    union_query = EMPTY_STRING

    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:

        domain_id_field = get_domain_id_field(domain_table)

        if union_query != EMPTY_STRING:
            union_query += UNION_ALL

        union_query += DOMAIN_REROUTE_EXCLUDED_INNER_QUERY.render(
            project_id=project_id,
            dataset_id=dataset_id,
            src_table=domain_table,
            src_id=domain_id_field,
            src_domain_id_field=domain_id_field)
    return union_query
Пример #4
0
def parse_domain_mapping_query_for_same_domains(project_id, dataset_id):
    """
    This function generates a query that generates id mappings in _logging_domain_alignment for
    the records being copied to the same domain table

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a query that generates id mappings for the records that will get copied over to the same domain
    """
    union_query = EMPTY_STRING

    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:

        domain = resources.get_domain(domain_table)
        domain_id_field = resources.get_domain_id_field(domain_table)
        domain_concept_id = resources.get_domain_concept_id(domain_table)

        if union_query != EMPTY_STRING:
            union_query += UNION_ALL

        union_query += DOMAIN_REROUTE_INCLUDED_INNER_QUERY.render(
            project_id=project_id,
            dataset_id=dataset_id,
            src_table=domain_table,
            dest_table=domain_table,
            src_id=domain_id_field,
            dest_id=domain_id_field,
            domain_concept_id=domain_concept_id,
            domain='\'{}\''.format('\',\''.join([domain, METADATA_DOMAIN])))
    return union_query
def parse_src_concept_id_update_query(project_id, dataset_id, table_name):
    """
    Fill in template query used to generate updated domain table

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :param table_name: name of a domain table
    :return: parsed src_concept_id_update query
    """
    fields = [field['name'] for field in resources.fields_for(table_name)]
    col_exprs = []
    fields_to_replace = {
        resources.get_domain_id_field(table_name): 'dest_id',
        resources.get_domain_concept_id(table_name): 'new_concept_id',
        resources.get_domain_source_concept_id(table_name): 'new_src_concept_id'
    }
    for field_name in fields:
        if field_name in fields_to_replace:
            col_expr = 'coalesce({replace_field}, {field}) AS {field}'.format(
                replace_field=fields_to_replace[field_name],
                field=field_name)
        else:
            col_expr = field_name
        col_exprs.append(col_expr)
    cols = ', '.join(col_exprs)

    return SRC_CONCEPT_ID_UPDATE_QUERY.format(cols=cols,
                                              project=project_id,
                                              dataset=dataset_id,
                                              domain_table=table_name,
                                              logging_table=SRC_CONCEPT_ID_TABLE_NAME)
def parse_mapping_table_update_query(project_id, dataset_id, table_name, mapping_table_name):
    """

    Fill in mapping tables query so it either gets dest_id from the logging table or the domain table

    :param project_id: identifies the project containing the dataset
    :param dataset_id: identifies the dataset containing the OMOP data
    :param table_name: name of the domain table for which the query needs to be parsed
    :param mapping_table_name: name of the mapping_table for which the query needs to be parsed
    :return:
    """
    fields = [field['name'] for field in resources.fields_for(mapping_table_name)]
    col_exprs = []
    for field_name in fields:
        if field_name == resources.get_domain_id_field(table_name):
            col_expr = 'coalesce(dest_id, {field}) AS {field}'.format(field=field_name)
        else:
            col_expr = field_name
        col_exprs.append(col_expr)
    cols = ', '.join(col_exprs)
    return UPDATE_MAPPING_TABLES_QUERY.format(cols=cols,
                                              project=project_id,
                                              dataset=dataset_id,
                                              mapping_table=mapping_table_name,
                                              logging_table=SRC_CONCEPT_ID_TABLE_NAME,
                                              domain_table=table_name)
Пример #7
0
def get_domain_fields(_domain_table):
    """
    This function retrieves all field names of a CDM table except for the id column such as condition_occurrence_id
    :param _domain_table:
    :return:
    """
    id_field = get_domain_id_field(_domain_table)
    fields = CDM_TABLE_SCHEMAS[_domain_table]
    return [field[NAME_FIELD] for field in fields if field[NAME_FIELD] != id_field]
Пример #8
0
def parse_reroute_domain_query(project_id, dataset_id, dest_table):
    """
    This function generates a query that reroutes the records from all domain tables for the given dest_table.
    It uses _mapping_alignment_table to determine in which domain table the records should land.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :param dest_table: the destination CDM table for rerouting
    :return: a query that reroutes the records from all domain tables for the given dest_table
    """
    union_queries = []

    for src_table in domain_mapping.DOMAIN_TABLE_NAMES:
        src_domain_id_field = get_domain_id_field(src_table)
        dest_domain_id_field = get_domain_id_field(dest_table)
        field_mapping_expr = resolve_field_mappings(src_table, dest_table)

        if src_table == dest_table:
            # We are doing this to make sure the schema doesn't change and also keep all the
            # records in the domain table for later rerouting to the other domains
            union_queries.append(
                SELECT_DOMAIN_RECORD_QUERY.render(
                    project_id=project_id,
                    dataset_id=dataset_id,
                    dest_table=dest_table,
                    field_mapping_expr=field_mapping_expr,
                    dest_domain_id_field=dest_domain_id_field))
        elif domain_mapping.exist_domain_mappings(src_table, dest_table):
            # We are only rerouting the records between domain tables that are not the same
            union_queries.append(
                REROUTE_DOMAIN_RECORD_QUERY.render(
                    project_id=project_id,
                    dataset_id=dataset_id,
                    src_table=src_table,
                    dest_table=dest_table,
                    src_domain_id_field=src_domain_id_field,
                    dest_domain_id_field=dest_domain_id_field,
                    field_mapping_expr=field_mapping_expr))

    return UNION_ALL.join(union_queries)
Пример #9
0
def create_domain_field_dict():
    """
    This function categorizes the CDM table fields and puts them into different 'buckets' of the dictionary.
    The purpose of creating this dictionary is to facilitate the mapping of the fields in the downstream process.
    person_id

    :return: a dictionary that contains CDM table fields
    """
    domain_fields = OrderedDict()
    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        _field_mappings = OrderedDict()
        common_field_mappings = OrderedDict()
        date_field_mappings = OrderedDict()
        specific_fields = []
        domain = get_domain(domain_table)
        domain_id_field = get_domain_id_field(domain_table)

        for field_name in [
                field_name for field_name in get_domain_fields(domain_table)
                if field_name != domain_id_field
        ]:

            # Added a special check for drug_exposure because the drug_exposure columns don't follow the same pattern
            # E.g. drug_exposure_start_time doesn't follow the pattern {domain}_start_datetime
            if field_name.find(domain_table) != -1:
                field_suffix = re.sub(domain_table, '', field_name)
            else:
                field_suffix = re.sub(domain.lower(), '', field_name)

            # Put different types of fields into dictionary
            if field_suffix in COMMON_DOMAIN_FIELD_SUFFIXES:
                common_field_mappings[field_suffix] = field_name
            elif field_suffix in DATE_FIELD_SUFFIXES:
                date_field_mappings[field_suffix] = field_name
            elif field_name in COMMON_DOMAIN_FIELD_SUFFIXES:
                common_field_mappings[field_name] = field_name
            elif field_name != domain_id_field:
                specific_fields.append(field_name)

        _field_mappings[DOMAIN_COMMON_FIELDS] = common_field_mappings
        _field_mappings[DOMAIN_SPECIFIC_FIELDS] = specific_fields
        _field_mappings[DOMAIN_DATE_FIELDS] = date_field_mappings
        domain_fields[domain_table] = _field_mappings

    return domain_fields
Пример #10
0
def get_queries(project=None, dataset=None, sandbox_dataset_id=None):
    """
    Return a list of queries to remove data-poor participant rows.

    The removal criteria is for participants is as follows:
    1. They have not completed "The Basics" PPI module, via the RDR
    2. They do not have any EHR data

    These participants are not particularly useful for analysis, so remove them
    here.
    :param sandbox_dataset_id: Identifies the sandbox dataset to store rows 
    #TODO use sandbox_dataset_id for CR

    :return:  A list of string queries that can be executed to delete data-poor
        participants and corresponding rows from the dataset.
    """
    fields = [
        'person.' + field['name'] for field in resources.fields_for('person')
    ]
    mapped_clinical_data_configs = {
        t: {
            'id_column': resources.get_domain_id_field(t)
        } for t in common.MAPPED_CLINICAL_DATA_TABLES
    }

    delete_query = SELECT_PERSON_WITH_BASICS_OR_EHR_TMPL.render(
        fields=fields,
        project=project,
        dataset=dataset,
        basics_module_concept_id=BASICS_MODULE_CONCEPT_ID,
        mapped_clinical_data_configs=mapped_clinical_data_configs)

    # drop from the person table, then delete all corresponding data for the now missing persons
    return [{
        clean_consts.QUERY: delete_query,
        clean_consts.DESTINATION_TABLE: 'person',
        clean_consts.DESTINATION_DATASET: dataset,
        clean_consts.DISPOSITION: bq_consts.WRITE_TRUNCATE,
    }] + drop_rows_for_missing_persons.get_queries(project, dataset)
 def test_get_domain_id_field(self):
     self.assertEqual(resources.get_domain_id_field(self.condition_table),
                      self.condition_occurrence_id)
     self.assertEqual(resources.get_domain_id_field(self.procedure_table),
                      self.procedure_occurrence_id)