def add_snpmapinfo_schemas_in_build_to_schema_bank(dbsnp_mirror_db_info,
                                                   metadata_connection_handle):
    with get_db_conn_for_species(
            dbsnp_mirror_db_info) as dbsnp_build_info_conn:
        query_to_get_snpmapinfo_tables = "select table_schema, table_name as full_table_name " \
                                         "from information_schema.tables " \
                                         "where lower(table_name) like 'b%snpmapinfo%' " \
                                            "and lower(table_schema) not like '%donotuse' " \
                                            "and lower(table_name) not like '%donotuse'"
        for result in get_all_results_for_query(
                dbsnp_build_info_conn, query_to_get_snpmapinfo_tables):
            snpmapinfo_table_schema, snpmapinfo_table_name = result[0], result[
                1]
            command_to_get_snpmap_table_definition = \
                ("psql -AF ' ' -t -U dbsnp -h {0} -p {1} -d dbsnp_{2} -v ON_ERROR_STOP=1 -c " + '"\\d {3}.{4}"')\
                .format(dbsnp_mirror_db_info["pg_host"], dbsnp_mirror_db_info["pg_port"],
                        dbsnp_mirror_db_info["dbsnp_build"], snpmapinfo_table_schema, snpmapinfo_table_name)
            snpmapinfo_table_definition = run_command_with_output(
                "Get SNPMapInfo table definition for " + snpmapinfo_table_name,
                command_to_get_snpmap_table_definition,
                return_process_output=True)
            insert_into_snpmapinfo_schema_bank(
                snpmapinfo_table_schema, snpmapinfo_table_name,
                snpmapinfo_table_definition.strip(),
                metadata_connection_handle)
def get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name):
    accession_columns = ['asm_acc', 'asm_version']
    assembly_name_columns = ['assembly']
    with get_db_conn_for_species(species_info) as pg_conn_for_species:
        accession_columns_query = "select distinct table_schema from information_schema.columns " \
                                  "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \
                                  "and column_name in ({2});".format(species_info["database_name"],
                                                                     snpmapinfo_table_name,
                                                                     ",".join(["'{0}'".format(name)
                                                                               for name in accession_columns]))
        assembly_name_columns_query = "select distinct table_schema from information_schema.columns " \
                                      "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \
                                      "and column_name in ({2});".format(species_info["database_name"],
                                                                         snpmapinfo_table_name,
                                                                         ",".join(["'{0}'".format(name)
                                                                                   for name in assembly_name_columns]))
        available_column_names = get_all_results_for_query(pg_conn_for_species, accession_columns_query)
        if len(available_column_names) > 0:
            return [accession_columns[0], accession_columns[1]]

        available_column_names = get_all_results_for_query(pg_conn_for_species, assembly_name_columns_query)
        if len(available_column_names) > 0:
            return [assembly_name_columns[0]]
        else:
            logger.error("No assembly related column names found for the table dbsnp_{0}.{1}"
                         .format(species_info["database_name"], snpmapinfo_table_name))
            return []
def attempt_snpmap_import_with_ddl(snpmapinfo_file_name, snpmapinfo_table_ddl,
                                   species_info):
    with get_db_conn_for_species(
            species_info) as species_data_connection_handle:
        snpmapinfo_table_name = "dbsnp_{0}.{1}_snpmapinfo".format(
            species_info["database_name"],
            get_build_version_from_file_name(snpmapinfo_file_name))
        snpmapinfo_table_creation_query = "create table {0} ({1})".format(
            snpmapinfo_table_name, snpmapinfo_table_ddl)
        snpmapinfo_table_drop_query = "drop table {0}".format(
            snpmapinfo_table_name)
        with species_data_connection_handle.cursor() as cursor, gzip.open(snpmapinfo_file_name) \
                as snpmapinfo_file_handle:
            try:
                cursor.execute(snpmapinfo_table_creation_query)
                species_data_connection_handle.commit()
                cursor.copy_from(snpmapinfo_file_handle,
                                 snpmapinfo_table_name,
                                 sep="\t",
                                 null="")
            except (psycopg2.DataError, psycopg2.ProgrammingError) as error:
                logger.error(error)
                species_data_connection_handle.rollback()
                cursor.execute(snpmapinfo_table_drop_query)
                species_data_connection_handle.commit()
                return False
            return True
def create_multimap_snp_table_and_indices(metadata_connection_handle,
                                          dbsnp_species_name, species_info):
    union_of_snpmapinfo_tables_query = " union all ".join([
        "select snp_id, weight, {0} as assembly from dbsnp_{1}.{2} where weight > 1"
        .format(
            "||'.'||".join(get_snpmapinfo_asm_columns(species_info,
                                                      table_name)),
            dbsnp_species_name, table_name) for table_name in
        get_snpmapinfo_tables_with_overweight_snps_for_dbsnp_species(
            metadata_connection_handle, dbsnp_species_name)
    ])
    if len(union_of_snpmapinfo_tables_query) > 0:
        multimap_snp_table_name = "multimap_snps"
        table_creation_query = """            
                create table if not exists dbsnp_{0}.{1} as (select distinct * from ({2}) temp);
                """.format(dbsnp_species_name, multimap_snp_table_name,
                           union_of_snpmapinfo_tables_query)
        with get_db_conn_for_species(
                species_info) as species_connection_handle:
            logger.info("Executing query: " + table_creation_query)
            execute_query(species_connection_handle, table_creation_query)
            for column in ["snp_id", "weight", "assembly"]:
                create_index_on_table(species_connection_handle,
                                      "dbsnp_" + dbsnp_species_name,
                                      multimap_snp_table_name, [column])
                vacuum_analyze_table(species_connection_handle,
                                     "dbsnp_" + dbsnp_species_name,
                                     multimap_snp_table_name, [column])
                execute_query(
                    species_connection_handle,
                    "grant select on dbsnp_{0}.{1} to dbsnp_ro".format(
                        dbsnp_species_name, multimap_snp_table_name))
def get_snpmapinfo_table_names_for_species(species_info):
    query = """select table_name from information_schema.tables 
    where lower(table_name) like '%snpmapinfo%' and table_schema = 'dbsnp_{0}'
    """
    with get_db_conn_for_species(species_info) as pg_conn_for_species:
        results = get_all_results_for_query(pg_conn_for_species, query.format(species_info["database_name"]))
        if len(results) > 0:
            if len(results[0]) > 0:
                return [result[0] for result in results]

    return []
def incorporate_mapping_weight_for_assembly(
        accessioning_mongo_handle: pymongo.MongoClient,
        metadata_connection_handle: pymongo.MongoClient, GCA_accession: str):
    accessioning_database_handle = accessioning_mongo_handle[
        eva_accession_database]

    # Since we create a single table for multimap SNPs for each species in cre
    # the individual table name does not matter
    schema_assembly_association = set([
        (schema_name, assembly_from_dbsnp) for schema_name, _,
        assembly_from_dbsnp in get_snpmapinfo_tables_with_GCA_assembly(
            metadata_connection_handle, GCA_accession)
    ])
    for schema_name, assembly_from_dbsnp in schema_assembly_association:
        species_info = get_species_info(metadata_connection_handle,
                                        schema_name)[0]
        query_to_get_mapping_weight = get_mapping_weight_query(
            assembly_from_dbsnp, schema_name)
        logger.info("Running query to get mapping weight: " +
                    query_to_get_mapping_weight)
        with get_db_conn_for_species(species_info) as species_connection_handle, \
                get_result_cursor(species_connection_handle, query_to_get_mapping_weight) as cursor:
            for result in cursor:
                snp_id, weight = result[0], result[1]
                for collection_name, collection_attributes in collections_to_update.items(
                ):
                    collection_handle = accessioning_database_handle[
                        collection_name]
                    collection_attributes["update_statements"].extend(
                        get_collection_update_statements(
                            collection_name, collection_handle,
                            collection_attributes["assembly_attribute_name"],
                            GCA_accession, collection_attributes[
                                "rs_accession_attribute_name"], snp_id,
                            collection_attributes[
                                "mapping_weight_attribute_path"], weight))
                    if bulk_update(collection_handle,
                                   collection_attributes["update_statements"]):
                        # Reset the list of update statements for this collection after a successful bulk update
                        collections_to_update[collection_name][
                            "update_statements"] = []

    # Perform any residual updates
    for collection_name, collection_attributes in collections_to_update.items(
    ):
        update_statements = collection_attributes["update_statements"]
        if len(update_statements) > 0:
            bulk_update(accessioning_database_handle[collection_name],
                        update_statements,
                        force_update=True)
def create_snpmapinfo_indexes(metadata_db_name, metadata_db_user, metadata_db_host):
    with get_pg_connection_handle(metadata_db_name, metadata_db_user, metadata_db_host) as metadata_connection_handle:
        for species_info in get_species_info(metadata_connection_handle):
            for snpmapinfo_table_name in get_snpmapinfo_table_names_for_species(species_info):
                if "asm_acc" in get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name):
                    index_columns = [("asm_acc", "asm_version")]
                else:
                    index_columns = [("assembly",)]

                index_columns += [("weight",)]
                with get_db_conn_for_species(species_info) as species_connection_handle:
                    for columns in index_columns:
                        create_index_on_table(species_connection_handle, "dbsnp_" + species_info["database_name"],
                                              snpmapinfo_table_name, columns)
                        vacuum_analyze_table(species_connection_handle, "dbsnp_" + species_info["database_name"],
                                             snpmapinfo_table_name, columns)
def get_distinct_asm_with_overweight_snps_in_snpmapinfo_table(snpmapinfo_table_name, species_info,
                                                              weight_threshold=1):
    with get_db_conn_for_species(species_info) as species_db_connection_handle:
        asm_columns = get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name)
        if "asm_acc" in asm_columns:
            distinct_asm_query = "select distinct asm_acc || '.' || asm_version as assembly " \
                                 "from dbsnp_{0}.{1}"
            type_of_asm = "assembly_accession"
        else:
            distinct_asm_query = "select distinct assembly from dbsnp_{0}.{1}"
            type_of_asm = "assembly_name"
        distinct_asm_query += " where weight > {0} and assembly is not null order by 1".format(weight_threshold)
        results = get_all_results_for_query(species_db_connection_handle,
                                            distinct_asm_query.format(species_info["database_name"],
                                                                      snpmapinfo_table_name))
        return [result[0] for result in results], type_of_asm
def export_snpmapinfo_for_species(species_info, metadata_connection_handle,
                                  mapping_weight_threshold, export_dir):
    exported_filenames_and_assemblies = []
    species_name = species_info["database_name"]

    weight_criteria_query = "select 'rs'||trim(cast(snp_id as text)) from dbsnp_{0}.{1} where weight > " + \
                            str(mapping_weight_threshold)
    copy_statement = "COPY ({0}) TO STDOUT WITH CSV"

    with get_db_conn_for_species(species_info) as species_db_connection_handle:
        snpmapinfo_table_names = get_snpmapinfo_table_names_for_species(
            species_info)
        for snpmapinfo_table_name in snpmapinfo_table_names:
            distinct_asm, type_of_asm = get_distinct_asm_with_overweight_snps_in_snpmapinfo_table(
                snpmapinfo_table_name, species_info)
            for asm in distinct_asm:
                if type_of_asm == "assembly_name":
                    specific_query = weight_criteria_query + " and assembly = '{0}'".format(
                        asm)
                else:
                    specific_query = weight_criteria_query + " and asm_acc = '{0}' and asm_version = '{1}'"\
                        .format(asm.split(".")[0], asm.split(".")[1])
                specific_query = specific_query.format(
                    species_name, snpmapinfo_table_name) + " order by 1"
                try:
                    associated_GCA_assembly = lookup_GCA_assembly(
                        species_name, snpmapinfo_table_name, asm,
                        metadata_connection_handle)
                except Exception as e:
                    logger.error(e)
                    continue
                output_file_name = os.path.join(
                    export_dir, "{0}_{1}_{2}_overweight_snps.csv".format(
                        species_name, snpmapinfo_table_name,
                        associated_GCA_assembly))
                with open(output_file_name, 'w') \
                        as output_file_handle:
                    get_result_cursor(species_db_connection_handle, specific_query)\
                        .copy_expert(copy_statement.format(specific_query), output_file_handle)
                    exported_filenames_and_assemblies.append(
                        (output_file_name,
                         get_build_version_from_file_name(
                             snpmapinfo_table_name), associated_GCA_assembly))

    return exported_filenames_and_assemblies
def remove_snpmapinfo_table_name_suffixes(metadata_db_name, metadata_db_user,
                                          metadata_db_host):
    with get_pg_connection_handle(
            metadata_db_name, metadata_db_user,
            metadata_db_host) as metadata_connection_handle:
        for species_info in get_species_info(metadata_connection_handle):
            for snpmapinfo_table_name in get_snpmapinfo_table_names_for_species(
                    species_info):
                table_name_components = list(
                    filter(lambda x: x.strip() != "",
                           snpmapinfo_table_name.lower().split("snpmapinfo")))
                if len(table_name_components) > 1:
                    with get_db_conn_for_species(
                            species_info) as species_db_connection_handle:
                        rename_query = "alter table dbsnp_{0}.{1} rename to {2}"\
                            .format(species_info["database_name"], snpmapinfo_table_name,
                                    "".join(table_name_components[:-1]) + "snpmapinfo")
                        logger.info("Running rename query: " + rename_query)
                        execute_query(species_db_connection_handle,
                                      rename_query)
                        species_db_connection_handle.commit()