def add_snpmapinfo_schemas_in_build_to_schema_bank(dbsnp_mirror_db_info, metadata_connection_handle): with get_db_conn_for_species( dbsnp_mirror_db_info) as dbsnp_build_info_conn: query_to_get_snpmapinfo_tables = "select table_schema, table_name as full_table_name " \ "from information_schema.tables " \ "where lower(table_name) like 'b%snpmapinfo%' " \ "and lower(table_schema) not like '%donotuse' " \ "and lower(table_name) not like '%donotuse'" for result in get_all_results_for_query( dbsnp_build_info_conn, query_to_get_snpmapinfo_tables): snpmapinfo_table_schema, snpmapinfo_table_name = result[0], result[ 1] command_to_get_snpmap_table_definition = \ ("psql -AF ' ' -t -U dbsnp -h {0} -p {1} -d dbsnp_{2} -v ON_ERROR_STOP=1 -c " + '"\\d {3}.{4}"')\ .format(dbsnp_mirror_db_info["pg_host"], dbsnp_mirror_db_info["pg_port"], dbsnp_mirror_db_info["dbsnp_build"], snpmapinfo_table_schema, snpmapinfo_table_name) snpmapinfo_table_definition = run_command_with_output( "Get SNPMapInfo table definition for " + snpmapinfo_table_name, command_to_get_snpmap_table_definition, return_process_output=True) insert_into_snpmapinfo_schema_bank( snpmapinfo_table_schema, snpmapinfo_table_name, snpmapinfo_table_definition.strip(), metadata_connection_handle)
def get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name): accession_columns = ['asm_acc', 'asm_version'] assembly_name_columns = ['assembly'] with get_db_conn_for_species(species_info) as pg_conn_for_species: accession_columns_query = "select distinct table_schema from information_schema.columns " \ "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \ "and column_name in ({2});".format(species_info["database_name"], snpmapinfo_table_name, ",".join(["'{0}'".format(name) for name in accession_columns])) assembly_name_columns_query = "select distinct table_schema from information_schema.columns " \ "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \ "and column_name in ({2});".format(species_info["database_name"], snpmapinfo_table_name, ",".join(["'{0}'".format(name) for name in assembly_name_columns])) available_column_names = get_all_results_for_query(pg_conn_for_species, accession_columns_query) if len(available_column_names) > 0: return [accession_columns[0], accession_columns[1]] available_column_names = get_all_results_for_query(pg_conn_for_species, assembly_name_columns_query) if len(available_column_names) > 0: return [assembly_name_columns[0]] else: logger.error("No assembly related column names found for the table dbsnp_{0}.{1}" .format(species_info["database_name"], snpmapinfo_table_name)) return []
def attempt_snpmap_import_with_ddl(snpmapinfo_file_name, snpmapinfo_table_ddl, species_info): with get_db_conn_for_species( species_info) as species_data_connection_handle: snpmapinfo_table_name = "dbsnp_{0}.{1}_snpmapinfo".format( species_info["database_name"], get_build_version_from_file_name(snpmapinfo_file_name)) snpmapinfo_table_creation_query = "create table {0} ({1})".format( snpmapinfo_table_name, snpmapinfo_table_ddl) snpmapinfo_table_drop_query = "drop table {0}".format( snpmapinfo_table_name) with species_data_connection_handle.cursor() as cursor, gzip.open(snpmapinfo_file_name) \ as snpmapinfo_file_handle: try: cursor.execute(snpmapinfo_table_creation_query) species_data_connection_handle.commit() cursor.copy_from(snpmapinfo_file_handle, snpmapinfo_table_name, sep="\t", null="") except (psycopg2.DataError, psycopg2.ProgrammingError) as error: logger.error(error) species_data_connection_handle.rollback() cursor.execute(snpmapinfo_table_drop_query) species_data_connection_handle.commit() return False return True
def create_multimap_snp_table_and_indices(metadata_connection_handle, dbsnp_species_name, species_info): union_of_snpmapinfo_tables_query = " union all ".join([ "select snp_id, weight, {0} as assembly from dbsnp_{1}.{2} where weight > 1" .format( "||'.'||".join(get_snpmapinfo_asm_columns(species_info, table_name)), dbsnp_species_name, table_name) for table_name in get_snpmapinfo_tables_with_overweight_snps_for_dbsnp_species( metadata_connection_handle, dbsnp_species_name) ]) if len(union_of_snpmapinfo_tables_query) > 0: multimap_snp_table_name = "multimap_snps" table_creation_query = """ create table if not exists dbsnp_{0}.{1} as (select distinct * from ({2}) temp); """.format(dbsnp_species_name, multimap_snp_table_name, union_of_snpmapinfo_tables_query) with get_db_conn_for_species( species_info) as species_connection_handle: logger.info("Executing query: " + table_creation_query) execute_query(species_connection_handle, table_creation_query) for column in ["snp_id", "weight", "assembly"]: create_index_on_table(species_connection_handle, "dbsnp_" + dbsnp_species_name, multimap_snp_table_name, [column]) vacuum_analyze_table(species_connection_handle, "dbsnp_" + dbsnp_species_name, multimap_snp_table_name, [column]) execute_query( species_connection_handle, "grant select on dbsnp_{0}.{1} to dbsnp_ro".format( dbsnp_species_name, multimap_snp_table_name))
def get_snpmapinfo_table_names_for_species(species_info): query = """select table_name from information_schema.tables where lower(table_name) like '%snpmapinfo%' and table_schema = 'dbsnp_{0}' """ with get_db_conn_for_species(species_info) as pg_conn_for_species: results = get_all_results_for_query(pg_conn_for_species, query.format(species_info["database_name"])) if len(results) > 0: if len(results[0]) > 0: return [result[0] for result in results] return []
def incorporate_mapping_weight_for_assembly( accessioning_mongo_handle: pymongo.MongoClient, metadata_connection_handle: pymongo.MongoClient, GCA_accession: str): accessioning_database_handle = accessioning_mongo_handle[ eva_accession_database] # Since we create a single table for multimap SNPs for each species in cre # the individual table name does not matter schema_assembly_association = set([ (schema_name, assembly_from_dbsnp) for schema_name, _, assembly_from_dbsnp in get_snpmapinfo_tables_with_GCA_assembly( metadata_connection_handle, GCA_accession) ]) for schema_name, assembly_from_dbsnp in schema_assembly_association: species_info = get_species_info(metadata_connection_handle, schema_name)[0] query_to_get_mapping_weight = get_mapping_weight_query( assembly_from_dbsnp, schema_name) logger.info("Running query to get mapping weight: " + query_to_get_mapping_weight) with get_db_conn_for_species(species_info) as species_connection_handle, \ get_result_cursor(species_connection_handle, query_to_get_mapping_weight) as cursor: for result in cursor: snp_id, weight = result[0], result[1] for collection_name, collection_attributes in collections_to_update.items( ): collection_handle = accessioning_database_handle[ collection_name] collection_attributes["update_statements"].extend( get_collection_update_statements( collection_name, collection_handle, collection_attributes["assembly_attribute_name"], GCA_accession, collection_attributes[ "rs_accession_attribute_name"], snp_id, collection_attributes[ "mapping_weight_attribute_path"], weight)) if bulk_update(collection_handle, collection_attributes["update_statements"]): # Reset the list of update statements for this collection after a successful bulk update collections_to_update[collection_name][ "update_statements"] = [] # Perform any residual updates for collection_name, collection_attributes in collections_to_update.items( ): update_statements = collection_attributes["update_statements"] if len(update_statements) > 0: bulk_update(accessioning_database_handle[collection_name], update_statements, force_update=True)
def create_snpmapinfo_indexes(metadata_db_name, metadata_db_user, metadata_db_host): with get_pg_connection_handle(metadata_db_name, metadata_db_user, metadata_db_host) as metadata_connection_handle: for species_info in get_species_info(metadata_connection_handle): for snpmapinfo_table_name in get_snpmapinfo_table_names_for_species(species_info): if "asm_acc" in get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name): index_columns = [("asm_acc", "asm_version")] else: index_columns = [("assembly",)] index_columns += [("weight",)] with get_db_conn_for_species(species_info) as species_connection_handle: for columns in index_columns: create_index_on_table(species_connection_handle, "dbsnp_" + species_info["database_name"], snpmapinfo_table_name, columns) vacuum_analyze_table(species_connection_handle, "dbsnp_" + species_info["database_name"], snpmapinfo_table_name, columns)
def get_distinct_asm_with_overweight_snps_in_snpmapinfo_table(snpmapinfo_table_name, species_info, weight_threshold=1): with get_db_conn_for_species(species_info) as species_db_connection_handle: asm_columns = get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name) if "asm_acc" in asm_columns: distinct_asm_query = "select distinct asm_acc || '.' || asm_version as assembly " \ "from dbsnp_{0}.{1}" type_of_asm = "assembly_accession" else: distinct_asm_query = "select distinct assembly from dbsnp_{0}.{1}" type_of_asm = "assembly_name" distinct_asm_query += " where weight > {0} and assembly is not null order by 1".format(weight_threshold) results = get_all_results_for_query(species_db_connection_handle, distinct_asm_query.format(species_info["database_name"], snpmapinfo_table_name)) return [result[0] for result in results], type_of_asm
def export_snpmapinfo_for_species(species_info, metadata_connection_handle, mapping_weight_threshold, export_dir): exported_filenames_and_assemblies = [] species_name = species_info["database_name"] weight_criteria_query = "select 'rs'||trim(cast(snp_id as text)) from dbsnp_{0}.{1} where weight > " + \ str(mapping_weight_threshold) copy_statement = "COPY ({0}) TO STDOUT WITH CSV" with get_db_conn_for_species(species_info) as species_db_connection_handle: snpmapinfo_table_names = get_snpmapinfo_table_names_for_species( species_info) for snpmapinfo_table_name in snpmapinfo_table_names: distinct_asm, type_of_asm = get_distinct_asm_with_overweight_snps_in_snpmapinfo_table( snpmapinfo_table_name, species_info) for asm in distinct_asm: if type_of_asm == "assembly_name": specific_query = weight_criteria_query + " and assembly = '{0}'".format( asm) else: specific_query = weight_criteria_query + " and asm_acc = '{0}' and asm_version = '{1}'"\ .format(asm.split(".")[0], asm.split(".")[1]) specific_query = specific_query.format( species_name, snpmapinfo_table_name) + " order by 1" try: associated_GCA_assembly = lookup_GCA_assembly( species_name, snpmapinfo_table_name, asm, metadata_connection_handle) except Exception as e: logger.error(e) continue output_file_name = os.path.join( export_dir, "{0}_{1}_{2}_overweight_snps.csv".format( species_name, snpmapinfo_table_name, associated_GCA_assembly)) with open(output_file_name, 'w') \ as output_file_handle: get_result_cursor(species_db_connection_handle, specific_query)\ .copy_expert(copy_statement.format(specific_query), output_file_handle) exported_filenames_and_assemblies.append( (output_file_name, get_build_version_from_file_name( snpmapinfo_table_name), associated_GCA_assembly)) return exported_filenames_and_assemblies
def remove_snpmapinfo_table_name_suffixes(metadata_db_name, metadata_db_user, metadata_db_host): with get_pg_connection_handle( metadata_db_name, metadata_db_user, metadata_db_host) as metadata_connection_handle: for species_info in get_species_info(metadata_connection_handle): for snpmapinfo_table_name in get_snpmapinfo_table_names_for_species( species_info): table_name_components = list( filter(lambda x: x.strip() != "", snpmapinfo_table_name.lower().split("snpmapinfo"))) if len(table_name_components) > 1: with get_db_conn_for_species( species_info) as species_db_connection_handle: rename_query = "alter table dbsnp_{0}.{1} rename to {2}"\ .format(species_info["database_name"], snpmapinfo_table_name, "".join(table_name_components[:-1]) + "snpmapinfo") logger.info("Running rename query: " + rename_query) execute_query(species_db_connection_handle, rename_query) species_db_connection_handle.commit()