def collect_mongo_genbank_contigs(private_config_xml_file, assembly_accession):
    """Record Genbank contig info for one assembly from the main and operations
    submitted-variant Mongo collections into the metadata DB (best-effort)."""
    # (collection names, extra keyword arguments for the insert helper)
    collection_groups = [
        (["dbsnpSubmittedVariantEntity", "submittedVariantEntity"], {}),
        (["dbsnpSubmittedVariantOperationEntity", "submittedVariantOperationEntity"],
         {"assembly_attribute_prefix": "inactiveObjects."}),
    ]
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") as metadata_connection_handle, \
                MongoClient(get_mongo_uri_for_eva_profile("development", private_config_xml_file)) \
                as mongo_connection_handle:
            for collections, extra_kwargs in collection_groups:
                for collection in collections:
                    insert_contig_info_to_db(collection, assembly_accession,
                                             metadata_connection_handle, mongo_connection_handle,
                                             **extra_kwargs)
    except Exception:
        # Best-effort collection: log the full stack trace instead of propagating
        logger.error(traceback.format_exc())
def export_all_multimap_snps_from_dbsnp_dumps(private_config_xml_file):
    """Dump (snp_id, GCA accession) pairs for every multimap SNP found in the
    per-species dbSNP dump databases, then sort/de-duplicate the output file."""
    result_file = "all_multimap_snp_ids_from_dbsnp_dumps.txt"
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("production", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        assembly_GCA_accession_map = get_assemblies_with_multimap_snps_for_species(metadata_connection_handle)
        for species_info in get_species_info(metadata_connection_handle):
            species_name = species_info["database_name"]
            logger.info("Processing species {0}...".format(species_name))
            if species_name not in assembly_GCA_accession_map:
                continue  # species has no assemblies with multimap SNPs
            assembly_filter = ",".join("'{0}'".format(assembly)
                                       for assembly in assembly_GCA_accession_map[species_name].keys())
            export_query = "select snp_id, assembly from dbsnp_{0}.multimap_snps " \
                           "where assembly in ({1})".format(species_name, assembly_filter)
            logger.info("Running export query: " + export_query)
            with get_db_conn_for_species(species_info) as species_connection_handle, \
                    open(result_file, 'a') as result_file_handle:
                for snp_id, assembly in get_result_cursor(species_connection_handle, export_query):
                    # Translate the dump's assembly name to its GCA accession on the way out
                    result_file_handle.write("{0},{1}\n".format(
                        snp_id, assembly_GCA_accession_map[species_name][assembly]))
    run_command_with_output("Sorting multimap SNP IDs from dbSNP source dumps...",
                            "sort -u {0} -o {0}".format(result_file))
def copy_accessioning_collections_to_embassy(private_config_xml_file, profile, taxonomy_id, assembly_accession,
                                             collections_to_copy, release_species_inventory_table,
                                             release_version, dump_dir):
    """Copy the accessioning collections for one assembly to its temp Mongo instance in Embassy.

    Exits the process: 0 on success, -1 on failure.

    Fix: ``release_info`` is now pre-initialised — previously, if an error occurred
    before it was assigned (e.g. while opening the Mongo port), the except handler
    raised a NameError that masked the original failure.
    """
    port_forwarding_process_id, mongo_port, exit_code = None, None, None
    release_info = None  # defined up-front so the error handler can reference it safely
    try:
        port_forwarding_process_id, mongo_port = open_mongo_port_to_tempmongo(private_config_xml_file, profile,
                                                                              taxonomy_id, assembly_accession,
                                                                              release_species_inventory_table,
                                                                              release_version)
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                              user="******") as metadata_connection_handle:
            # To be idempotent, clear destination tempmongo database
            destination_db_name = get_release_db_name_in_tempmongo_instance(taxonomy_id)
            MongoClient(port=mongo_port).drop_database(destination_db_name)
            release_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
                                                                   release_species_inventory_table,
                                                                   release_version, metadata_connection_handle)
            logger.info("Beginning data copy to remote MongoDB host {0} on port {1}..."
                        .format(release_info["tempmongo_instance"], mongo_port))
            collections_to_copy_map = get_collections_to_copy(collections_to_copy,
                                                              sources=release_info["sources"])
            mongo_data_copy_to_remote_host(mongo_port, private_config_xml_file, profile, assembly_accession,
                                           collections_to_copy_map, dump_dir, destination_db_name)
            exit_code = 0
    except Exception:
        tempmongo_instance = release_info["tempmongo_instance"] if release_info else "(unknown)"
        logger.error("Encountered an error while copying species data to Embassy for assemblies in "
                     + tempmongo_instance + "\n" + traceback.format_exc())
        exit_code = -1
    finally:
        close_mongo_port_to_tempmongo(port_forwarding_process_id)
        logger.info("Copy process completed with exit_code: " + str(exit_code))
        sys.exit(exit_code)
def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path,
                                  vcf_sort_script_path, taxonomy_id, assembly_accession,
                                  release_species_inventory_table, release_version, species_release_folder):
    """Merge the dbSNP and EVA release files (VCF and text categories) for one assembly
    by chaining the per-category merge commands with '&&' and running them once."""
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                          user="******") as metadata_connection_handle:
        release_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
                                                               release_species_inventory_table,
                                                               release_version, metadata_connection_handle)
        merge_commands = []
        for vcf_file_category in release_vcf_file_categories:
            merge_commands += merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path,
                                                        assembly_accession, species_release_folder,
                                                        vcf_file_category, release_info["sources"])
        for text_release_file_category in release_text_file_categories:
            merge_commands += merge_dbsnp_eva_text_files(assembly_accession, species_release_folder,
                                                         text_release_file_category, release_info["sources"])
        run_command_with_output("Merging dbSNP and EVA release files for assembly: " + assembly_accession,
                                " && ".join(merge_commands))
def main():
    """CLI entry point: read contig accessions from stdin and record, for each one,
    the assemblies where it may appear according to EUtils."""
    parser = argparse.ArgumentParser(
        description='Get possible assemblies where given Genbank contigs are present',
        formatter_class=argparse.RawTextHelpFormatter, add_help=False)
    parser.add_argument("--private-config-xml-file",
                        help="Full path to private configuration file (ex: /path/to/settings.xml)",
                        required=True)
    parser.add_argument("--eutils-api-key", help="EUtils API key", required=True)
    args = parser.parse_args()
    Entrez.api_key = args.eutils_api_key
    create_table_to_collect_possible_assemblies(args.private_config_xml_file)
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", args.private_config_xml_file),
                          user="******") as metadata_connection_handle:
        for line in sys.stdin:
            contig_accession = line.strip()
            logger.info(f"Getting possible assemblies for {contig_accession} from EUtils...")
            insert_possible_assemblies_for_contig(
                metadata_connection_handle, contig_accession,
                get_assemblies_where_contig_appears(contig_accession))
def run_release_for_species(common_release_properties_file, taxonomy_id, memory):
    """Prepare the Nextflow release workflow for one taxonomy and launch it in the background."""
    common_release_properties = get_common_release_properties(common_release_properties_file)
    private_config_xml_file = common_release_properties["private-config-xml-file"]
    release_species_inventory_table = common_release_properties["release-species-inventory-table"]
    release_version = common_release_properties["release-version"]
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        release_assemblies = get_release_assemblies_for_taxonomy(
            taxonomy_id, release_species_inventory_table, release_version, metadata_connection_handle)
        workflow_file_name, release_log_file = prepare_release_workflow_file_for_species(
            common_release_properties, taxonomy_id, release_assemblies, memory)
        workflow_report_file_name = workflow_file_name.replace(".nf", ".report.html")
        # Nextflow refuses to overwrite an existing report, so remove any stale one first
        if os.path.exists(workflow_report_file_name):
            os.remove(workflow_report_file_name)
        workflow_command = "cd {0} && {1} run {2} -c {3} -with-report {4} -bg".format(
            os.path.dirname(release_log_file), common_release_properties["nextflow-binary-path"],
            workflow_file_name, common_release_properties["nextflow-config-path"], workflow_report_file_name)
        logger.info("Check log file in: " + release_log_file + " to monitor progress...")
        logger.info("Running workflow file {0} with the following command:\n\n {1} \n\n"
                    "Use the above command with -resume if this workflow needs to be resumed in the future"
                    .format(workflow_file_name, workflow_command))
        os.system(workflow_command)
def get_assemblies_from_evapro(private_config_xml_file):
    """Return the GCA assembly accessions present in EVAPRO's accessioned_assembly table.

    :param private_config_xml_file: path to the private maven settings file
    :return: list of assembly accession strings (e.g. 'GCA_...')

    Fix: the connection is now closed explicitly — the original leaked it
    (no ``close()`` and no context manager).
    """
    query = "select assembly_accession from accessioned_assembly where assembly_accession like 'GCA%'"
    metadata_handle = psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
        "development", private_config_xml_file), user="******")
    try:
        evapro_assemblies = get_all_results_for_query(metadata_handle, query)
    finally:
        metadata_handle.close()
    return [asm[0] for asm in evapro_assemblies]
def create_table_to_collect_assembly_report_genbank_contigs(private_config_xml_file):
    """Create, if absent, the table holding Genbank contigs parsed from assembly reports."""
    create_query = ("create table if not exists {0} "
                    "(assembly_accession text, contig_accession text, "
                    "chromosome_name text)").format(asm_report_contigs_table_name)
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        execute_query(metadata_connection_handle, create_query)
def create_table_to_collect_possible_assemblies(private_config_xml_file):
    """Create, if absent, the table mapping Genbank contig accessions to candidate assemblies."""
    create_query = ("create table if not exists {0} "
                    "(genbank_accession text, assembly_accession text, "
                    "primary key (genbank_accession, assembly_accession))"
                    .format(possible_assemblies_table_name))
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        execute_query(metadata_connection_handle, create_query)
def create_table_accession_counts(private_config_xml_file):
    """Create, if absent, the eva_stats table recording submitted-variant load counts."""
    query_create_table = (
        'CREATE TABLE IF NOT EXISTS eva_stats.submitted_variants_load_counts '
        '(source TEXT, taxid INTEGER, assembly_accession TEXT, project_accession TEXT, date_loaded TIMESTAMP, '
        'number_submitted_variants BIGINT NOT NULL, '
        'primary key(taxid, assembly_accession, project_accession, date_loaded))')
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        execute_query(metadata_connection_handle, query_create_table)
def create_table_for_count_validation(private_config_xml_file):
    """Create, if absent, the table used to validate document counts during Mongo migration."""
    create_query = ("create table if not exists {0} "
                    "(mongo_host text, database text, collection text, "
                    "document_count bigint not null, report_time timestamp, "
                    "primary key(mongo_host, database, collection, report_time))"
                    .format(mongo_migration_count_validation_table_name))
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        execute_query(metadata_connection_handle, create_query)
def insert_count_validation_result_to_db(private_config_xml_file, count_validation_res_list):
    """Persist a count-validation result into the Mongo-migration validation table."""
    if not count_validation_res_list:
        return  # nothing to insert
    insert_statement = ("INSERT INTO {0} "
                        "(mongo_host, database, collection, document_count,report_time) "
                        "VALUES %s".format(mongo_migration_count_validation_table_name))
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        with metadata_connection_handle.cursor() as cursor:
            # NOTE(review): the list is wrapped so the whole result list forms a
            # single row for execute_values — confirm callers pass one row's values
            psycopg2.extras.execute_values(cursor, insert_statement, [count_validation_res_list])
def create_table_for_progress(private_config_xml_file):
    """Create, if absent, the remapping_progress tracking table.

    Fix: the original DDL quoted ``target_assembly_accession`` with MySQL-style
    backticks, which is invalid syntax in PostgreSQL (psycopg2 targets Postgres;
    every other column here is unquoted). The backticks are removed.
    """
    query_create_table = (
        'CREATE TABLE IF NOT EXISTS remapping_progress '
        '(source TEXT, taxid INTEGER, scientific_name TEXT, assembly_accession TEXT, number_of_study INTEGER NOT NULL,'
        'number_submitted_variants BIGINT NOT NULL, release_number INTEGER, target_assembly_accession TEXT, '
        'report_time TIMESTAMP DEFAULT NOW(), progress_status TEXT, start_time TIMESTAMP, '
        'completion_time TIMESTAMP, remapping_version TEXT, nb_variant_extracted INTEGER, '
        'nb_variant_remapped INTEGER, nb_variant_ingested INTEGER, '
        'primary key(source, taxid, assembly_accession, release_number))')
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file), user="******") as metadata_connection_handle:
        execute_query(metadata_connection_handle, query_create_table)
def create_table_to_collect_mongo_genbank_contigs(private_config_xml_file):
    """Create, if absent, the table holding Genbank contigs observed in the Mongo collections."""
    create_query = ("create table if not exists {0} "
                    "(source text, assembly_accession text, "
                    "study text, contig_accession text, chromosome_name text, "
                    "num_entries_in_db bigint, is_contig_in_asm_report boolean, "
                    "primary key(source, assembly_accession, study, "
                    "contig_accession))"
                    .format(mongo_genbank_contigs_table_name))
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        execute_query(metadata_connection_handle, create_query)
def update_release_status_for_assembly(private_config_xml_file, taxonomy_id, assembly_accession,
                                       release_version):
    """Mark the release for one (taxonomy, assembly) pair as done in the progress table."""
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file), user="******") as metadata_connection_handle:
        update_release_progress_status(metadata_connection_handle, taxonomy_id, assembly_accession,
                                       release_version, release_status='done')
    logger.info("Successfully marked release status as 'Done' in {0} for taxonomy {1} and assembly {2}"
                .format(release_progress_table, taxonomy_id, assembly_accession))
def add_mapping_weight_attribute_for_dbsnp_species(private_config_xml_file, dbsnp_species_name):
    """Import the mapping-weight attribute for one dbSNP species; exits 0 on success, 1 on error."""
    try:
        # Species database names end with the taxonomy ID, e.g. cow_9913
        dbsnp_species_taxonomy = int(dbsnp_species_name.split("_")[-1])
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("production", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            import_mapping_weight_attribute_for_dbsnp_species(private_config_xml_file,
                                                              metadata_connection_handle,
                                                              dbsnp_species_taxonomy)
    except Exception as ex:
        logger.error("Encountered an error while adding mapping attribute for " + dbsnp_species_name + "\n"
                     + "\n".join(traceback.format_exception(type(ex), ex, ex.__traceback__)))
        sys.exit(1)
    sys.exit(0)
def insert_remapping_progress_to_db(private_config_xml_file, dataframe):
    """Bulk-insert the rows of a remapping-progress dataframe into remapping_progress."""
    rows_to_insert = dataframe.values.tolist()
    if not rows_to_insert:
        return  # nothing to insert
    query_insert = ('INSERT INTO remapping_progress '
                    '(source, taxid, scientific_name, assembly_accession, number_of_study, '
                    'number_submitted_variants, target_assembly_accession, release_number) '
                    'VALUES %s')
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file), user="******") as metadata_connection_handle:
        with metadata_connection_handle.cursor() as cursor:
            psycopg2.extras.execute_values(cursor, query_insert, rows_to_insert)
def publish_release_files_to_ftp(common_release_properties_file, taxonomy_id):
    """Publish the release files for one taxonomy to the public FTP: top-level
    files, species-level files (including unmapped data) and assembly-level files."""
    release_properties = ReleaseProperties(common_release_properties_file)
    create_requisite_folders(release_properties)
    # Release README, known issues etc.,
    publish_release_top_level_files_to_ftp(release_properties)
    metadata_password = get_properties_from_xml_file(
        "development", release_properties.private_config_xml_file)["eva.evapro.password"]
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", release_properties.private_config_xml_file),
            user="******", password=metadata_password) as metadata_connection_handle:
        assemblies_to_process = get_release_assemblies_info_for_taxonomy(
            taxonomy_id, release_properties, metadata_connection_handle)
        species_has_unmapped_data = any(
            assembly_info["assembly"] == "Unmapped" for assembly_info in assemblies_to_process)
        # Publish species level data
        species_current_release_folder_name, species_previous_release_folder_name = \
            get_current_and_previous_release_folders_for_taxonomy(
                taxonomy_id, release_properties, metadata_connection_handle)
        create_species_folder(release_properties, species_current_release_folder_name)
        # Unmapped variant data is published at the species level
        # because they are not mapped to any assemblies (duh!)
        if species_has_unmapped_data:
            publish_species_level_files_to_ftp(release_properties, species_current_release_folder_name,
                                               species_previous_release_folder_name,
                                               species_has_unmapped_data)
        # Publish assembly level data
        for current_release_assembly_info in get_release_assemblies_for_release_version(
                assemblies_to_process, release_properties.release_version):
            if current_release_assembly_info["assembly"] != "Unmapped":
                publish_assembly_release_files_to_ftp(current_release_assembly_info, release_properties)
        # Symlinks with assembly names in the species folder ex: Sorbi1 -> GCA_000003195.1
        create_assembly_name_symlinks(get_folder_path_for_species(
            release_properties.public_ftp_current_release_folder, species_current_release_folder_name))
def get_accession_counts_per_assembly(private_config_xml_file, source):
    """Return per-assembly submitted-variant totals for one source.

    :return: dict of assembly_accession -> (assembly_accession, taxid, total count)

    Fix: the original's adjacent string literals concatenated to
    ``...source='x'GROUP BY...`` (no space before GROUP BY), which is invalid SQL.
    """
    accession_count = {}
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file), user="******") as pg_conn:
        query = ('SELECT assembly_accession, taxid, SUM(number_submitted_variants) '
                 'FROM eva_stats.submitted_variants_load_counts '
                 "WHERE source='%s' "
                 'GROUP BY assembly_accession, taxid' % source)
        for assembly_accession, taxid, count_ssid in get_all_results_for_query(pg_conn, query):
            # NOTE(review): keyed by assembly only — if one assembly spans several
            # taxids, later rows overwrite earlier ones; confirm this is intended.
            accession_count[assembly_accession] = (assembly_accession, taxid, count_ssid)
    return accession_count
def update_release_status_for_assembly(private_config_xml_file, profile, release_species_inventory_table,
                                       taxonomy_id, assembly_accession, release_version):
    """Mark the release for one (taxonomy, assembly) pair as completed in the inventory table."""
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            profile, private_config_xml_file), user="******") as metadata_connection_handle:
        update_release_progress_status(metadata_connection_handle, release_species_inventory_table,
                                       taxonomy_id, assembly_accession, release_version,
                                       release_status='Completed')
    logger.info("Successfully marked release status as 'Completed' in {0} for taxonomy {1} and assembly {2}"
                .format(release_species_inventory_table, taxonomy_id, assembly_accession))
def validate_release_vcf_files(private_config_xml_file, profile, taxonomy_id, assembly_accession,
                               release_species_inventory_table, release_version, species_release_folder,
                               vcf_validator_path, assembly_checker_path):
    # Run the VCF validator and the assembly checker over every release VCF for an
    # assembly. Individual validation failures are tolerated: each command is
    # wrapped in "(...) || true" and the commands are chained with ';'.
    run_command_with_output(
        "Remove existing VCF validation and assembly report outputs...",
        "rm -f {0}/{1}/{2} {0}/{1}/{3}".format(
            species_release_folder, assembly_accession, vcf_validation_output_file_pattern,
            asm_report_output_file_pattern))
    validate_release_vcf_files_commands = []
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file), user="******") as \
            metadata_connection_handle:
        release_inventory_info_for_assembly = get_release_inventory_info_for_assembly(
            taxonomy_id, assembly_accession, release_species_inventory_table, release_version,
            metadata_connection_handle)
        fasta_path = release_inventory_info_for_assembly["fasta_path"]
        assembly_report_path = release_inventory_info_for_assembly[
            "report_path"]
        # Drop a stale FASTA index so the assembly checker can regenerate it
        remove_index_if_outdated(fasta_path)
        if assembly_report_path.startswith("file:/"):
            # Convert a file URI into a plain filesystem path for the checker
            assembly_report_path = assembly_report_path.replace("file:/", "/")
        for vcf_file_category in release_vcf_file_categories:
            release_vcf_file_name = get_release_vcf_file_name(
                species_release_folder, assembly_accession, vcf_file_category)
            release_vcf_dir = os.path.dirname(release_vcf_file_name)
            # Multimap VCF categories are only run through the assembly checker,
            # not the VCF validator
            if "multimap" not in vcf_file_category:
                validate_release_vcf_files_commands.append(
                    "({0} -i {1} -o {2}) || true".format(
                        vcf_validator_path, release_vcf_file_name, release_vcf_dir))
            validate_release_vcf_files_commands.append(
                "({0} -i {1} -f {2} -a {3} -o {4} -r text,summary) || true"
                .format(assembly_checker_path, release_vcf_file_name, fasta_path, assembly_report_path,
                        release_vcf_dir))
        # We don't expect the validation commands to all pass, hence use semi-colon to run them back to back
        final_validate_command = " ; ".join(
            validate_release_vcf_files_commands)
        run_command_with_output(
            "Validating release files for assembly: " +
            assembly_accession, final_validate_command)
def insert_accession_counts_to_db(private_config_xml_file, accession_counts, source):
    """Upsert submitted-variant load counts for one source into eva_stats."""
    if not accession_counts:
        return  # nothing to insert
    query_insert = ('INSERT INTO eva_stats.submitted_variants_load_counts '
                    '(source, assembly_accession, taxid, project_accession, date_loaded, number_submitted_variants) '
                    'VALUES %s '
                    'ON CONFLICT (taxid, assembly_accession, project_accession, date_loaded) '
                    'DO UPDATE SET number_submitted_variants = EXCLUDED.number_submitted_variants')
    # Bake the source into the row template; the remaining fields come from each count tuple
    row_template = ("('" + source + "', %s, %s, %s, %s, %s)")
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file), user="******") as metadata_connection_handle:
        with metadata_connection_handle.cursor() as cursor:
            psycopg2.extras.execute_values(cursor, query_insert, accession_counts, row_template)
def main(private_config_xml_file):
    """Report remapping (source, target) assembly pairs whose NCBI species taxonomies differ.

    Fixes in this revision:
    - removed a no-op bare expression statement (``private_config_xml_file``)
    - the query returns one row per distinct (source, target) pair, so the
      result list is now iterated instead of being unpacked into exactly two
      names (which only worked when the query returned precisely two rows)
    """
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                          user="******") as pg_conn:
        query = ("select distinct origin_assembly_accession, assembly_accession "
                 "from eva_progress_tracker.remapping_tracker "
                 "where origin_assembly_accession!=assembly_accession and num_ss_ids>0")
        for source_assembly, target_assembly in get_all_results_for_query(pg_conn, query):
            source_assembly_info = get_ncbi_assembly_dicts_from_term(source_assembly)[0]
            target_assembly_info = get_ncbi_assembly_dicts_from_term(target_assembly)[0]
            source_taxid = source_assembly_info['speciestaxid']
            target_taxid = target_assembly_info['speciestaxid']
            source_organism = source_assembly_info['organism']
            target_organism = target_assembly_info['organism']
            if source_taxid != target_taxid:
                print(f'{source_assembly} and {target_assembly} have different source species '
                      f'{source_organism} != {target_organism}')
def get_release_properties_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
                                        release_species_inventory_table, release_version,
                                        species_release_folder):
    """Build the full property map needed to run the release job for one assembly:
    inventory info (with a normalised report-path file URI, output folder and
    accessioning DB name) merged with the release job repository properties."""
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                          user="******") as metadata_connection_handle:
        release_properties = get_release_inventory_info_for_assembly(
            taxonomy_id, assembly_accession, release_species_inventory_table, release_version,
            metadata_connection_handle)
        # Downstream consumers expect the report path as a file URI
        if not release_properties["report_path"].startswith("file:"):
            release_properties["report_path"] = "file:" + release_properties["report_path"]
        release_properties["output_folder"] = os.path.join(species_release_folder, assembly_accession)
        release_properties["mongo_accessioning_db"] = \
            get_release_db_name_in_tempmongo_instance(taxonomy_id)
        return merge_two_dicts(release_properties,
                               get_release_job_repo_properties(private_config_xml_file, profile))
def open_mongo_port_to_tempmongo(private_config_xml_file, profile, taxonomy_id, assembly,
                                 release_species_inventory_table, release_version):
    """Forward a free local port to the temp MongoDB instance that holds this assembly's data.

    :return: (port forwarding process ID, forwarded local port)
    :raises Exception: wrapping the underlying stack trace if the port cannot be opened

    Fix: ``tempmongo_instance`` is now pre-initialised — previously, if an error
    occurred before it was assigned (e.g. while connecting to Postgres), the
    except handler raised a NameError that masked the original failure.
    """
    MONGO_PORT = 27017
    local_forwarded_port = get_available_local_port(MONGO_PORT)
    tempmongo_instance = None  # defined up-front so the error message can reference it safely
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                              user="******") as metadata_connection_handle:
            tempmongo_instance = get_target_mongo_instance_for_assembly(
                taxonomy_id, assembly, release_species_inventory_table, release_version,
                metadata_connection_handle)
            logger.info("Forwarding remote MongoDB port 27017 to local port {0}...".format(
                local_forwarded_port))
            port_forwarding_process_id = forward_remote_port_to_local_port(
                tempmongo_instance, MONGO_PORT, local_forwarded_port)
            return port_forwarding_process_id, local_forwarded_port
    except Exception:
        raise Exception(
            "Encountered an error while opening a port to the remote MongoDB instance: "
            + str(tempmongo_instance) + "\n" + traceback.format_exc())
def collect_assembly_report_genbank_contigs(private_config_xml_file, assembly_accession):
    """Download the NCBI assembly report for an assembly and load its Genbank
    contigs into the metadata DB in chunks (best-effort; errors are logged)."""
    insert_chunk_size = 100
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            assembly_report_url = assembly_utils.get_assembly_report_url(assembly_accession)
            assembly_report_file_name = os.path.basename(assembly_report_url)
            # Remove any previously downloaded copy so wget does not create a numbered duplicate
            os.system("rm -f " + assembly_report_file_name)
            wget.download(assembly_report_url)
            contig_info_list = []
            with open(assembly_report_file_name, 'r') as assembly_report_handle:
                for raw_line in assembly_report_handle:
                    line = raw_line.strip()
                    if line.startswith("#"):
                        continue  # skip report header lines
                    line_components = line.split("\t")
                    chromosome_name, genbank_accession = line_components[0], line_components[4]
                    contig_info_list.append((assembly_accession, genbank_accession, chromosome_name))
                    if len(contig_info_list) == insert_chunk_size:
                        insert_contigs_to_db(metadata_connection_handle, contig_info_list)
                        contig_info_list = []
            # Flush the final partial chunk
            insert_contigs_to_db(metadata_connection_handle, contig_info_list)
    except Exception:
        logger.error(traceback.format_exc())
def main():
    """Warn about EVA/dbSNP contigs for an assembly that are not Genbank accessions
    in the supplied assembly report; returns 0."""
    argparser = ArgumentParser()
    argparser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml",
                           required=True)
    argparser.add_argument("--assembly_accession", help="GCA_000003205.1", required=True)
    argparser.add_argument("--assembly_report_path", help="path to the report to check contigs against",
                           required=True)
    args = argparser.parse_args()
    genbank_to_row = get_contig_genbank(args.assembly_report_path)
    log_cfg.add_stdout_handler()
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", args.private_config_xml_file), user="******") as pg_conn:
        eva_contigs, dbSNP_contigs = get_contigs_accessions_for(pg_conn, args.assembly_accession)
        # Check both sources with the same membership test against the report
        for source_name, contigs in (("EVA", eva_contigs), ("dbSNP", dbSNP_contigs)):
            for contig in contigs:
                if contig not in genbank_to_row:
                    logger.warning(
                        'For assembly {} contig {} found in {} is not genbank in the report {}'
                        .format(args.assembly_accession, contig, source_name, args.assembly_report_path))
    return 0
def collect_assembly_report_genbank_contigs(private_config_xml_file, assembly_accession):
    """Download the NCBI assembly report and load contig rows — including RefSeq
    accession and Genbank/RefSeq equivalence — into the metadata DB in chunks
    (best-effort; errors are logged)."""
    insert_chunk_size = 100
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            asm = NCBIAssembly(assembly_accession, species_scientific_name=None, reference_directory=None)
            assembly_report_file_name = os.path.basename(asm.assembly_report_url)
            # Remove any previously downloaded copy so wget does not create a numbered duplicate
            os.system("rm -f " + assembly_report_file_name)
            wget.download(asm.assembly_report_url)
            contig_info_list = []
            with open(assembly_report_file_name, 'r') as report_handle:
                for raw_line in report_handle:
                    line = raw_line.strip()
                    if line.startswith("#"):
                        continue  # skip report header lines
                    components = line.split("\t")
                    chromosome_name, genbank_accession = components[0], components[4]
                    accession_equivalence, refseq_accession = components[5], components[6]
                    # Equivalence "Relationship" column in the assembly report indicates if
                    # Genbank and RefSeq contig accessions are equivalent
                    is_equivalent_genbank_available = (accession_equivalence.strip() == "=")
                    contig_info_list.append((assembly_accession, genbank_accession, chromosome_name,
                                             is_equivalent_genbank_available, refseq_accession))
                    if len(contig_info_list) == insert_chunk_size:
                        insert_contigs_to_db(metadata_connection_handle, contig_info_list)
                        contig_info_list = []
            # Flush the final partial chunk
            insert_contigs_to_db(metadata_connection_handle, contig_info_list)
    except Exception:
        logger.error(traceback.format_exc())
def test_get_pg_metadata_uri_for_eva_profile(self):
    """The 'test' profile should yield the postgresql URI built from the config file."""
    expected_uri = 'postgresql://pgsql.example.com:5432/testdatabase'
    actual_uri = get_pg_metadata_uri_for_eva_profile('test', self.config_file)
    self.assertEqual(actual_uri, expected_uri)
def insert_counts_in_db(private_config_xml_file, metrics_per_assembly, ranges_per_assembly):
    # Insert release-3 per-assembly RS statistics: combine the newly computed
    # metrics with the cumulative counts recorded for release 2, then carry
    # release-2 rows forward (with zeroed "new_*" columns) for assemblies that
    # did not appear in this release's logs.
    # NOTE(review): SQL is built via f-strings from internal inputs — not safe
    # for untrusted data.
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file), user="******") \
            as metadata_connection_handle:
        for asm in metrics_per_assembly:
            # get last release data for assembly
            query_release2 = f"select * from dbsnp_ensembl_species.release_rs_statistics_per_assembly "\
                             f"where assembly_accession = '{asm}' and release_version = 2"
            logger.info(query_release2)
            asm_last_release_data = get_all_results_for_query(
                metadata_connection_handle, query_release2)
            # insert data for release 3
            taxid = ranges_per_assembly[asm]['taxid']
            scientific_name = ranges_per_assembly[asm][
                'scientific_name'].capitalize().replace('_', ' ')
            folder = f"{ranges_per_assembly[asm]['scientific_name']}/{asm}"
            release_version = 3
            release3_new_remapped_current_rs = metrics_per_assembly[asm][
                'new_remapped_current_rs']
            release3_new_clustered_current_rs = metrics_per_assembly[asm][
                'new_clustered_current_rs']
            # new current RS = newly clustered + newly remapped
            release3_new_current_rs = release3_new_clustered_current_rs + release3_new_remapped_current_rs
            release3_new_merged_rs = metrics_per_assembly[asm]['merged_rs']
            release3_new_split_rs = metrics_per_assembly[asm]['split_rs']
            release3_new_ss_clustered = metrics_per_assembly[asm][
                'new_ss_clustered']
            # Column list for the insert; the VALUES clause is completed below
            # depending on whether the assembly already existed in release 2
            insert_query = f"insert into dbsnp_ensembl_species.release_rs_statistics_per_assembly "\
                           f"(taxonomy_id, scientific_name, assembly_accession, release_folder, release_version, " \
                           f"current_rs, multi_mapped_rs, merged_rs, deprecated_rs, merged_deprecated_rs, " \
                           f"new_current_rs, new_multi_mapped_rs, new_merged_rs, new_deprecated_rs, " \
                           f"new_merged_deprecated_rs, new_ss_clustered, remapped_current_rs, " \
                           f"new_remapped_current_rs, split_rs, new_split_rs, ss_clustered, clustered_current_rs," \
                           f"new_clustered_current_rs) " \
                           f"values ({taxid}, '{scientific_name}', '{asm}', '{folder}', {release_version}, "
            if asm_last_release_data:
                # Cumulative counts recorded for this assembly in release 2
                # (positional columns of release_rs_statistics_per_assembly)
                release2_current_rs = asm_last_release_data[0][5]
                release2_merged_rs = asm_last_release_data[0][7]
                release2_multi_mapped_rs = asm_last_release_data[0][6]
                release2_deprecated_rs = asm_last_release_data[0][8]
                release2_merged_deprecated_rs = asm_last_release_data[0][9]
                # get ss clustered
                query_ss_clustered = f"select sum(new_ss_clustered) " \
                                     f"from dbsnp_ensembl_species.release_rs_statistics_per_assembly " \
                                     f"where assembly_accession = '{asm}'"
                logger.info(query_ss_clustered)
                ss_clustered_previous_releases = get_all_results_for_query(
                    metadata_connection_handle, query_ss_clustered)
                release3_ss_clustered = ss_clustered_previous_releases[0][
                    0] + release3_new_ss_clustered
                # if assembly already existed -> add counts
                release3_current_rs = release2_current_rs + release3_new_current_rs
                release3_merged_rs = release2_merged_rs + release3_new_merged_rs
                # current_rs in previous releases (1 and 2) were all new clustered
                release3_clustered_current_rs = release2_current_rs + release3_new_clustered_current_rs
                insert_query_values = f"{release3_current_rs}, " \
                                      f"{release2_multi_mapped_rs}, " \
                                      f"{release3_merged_rs}, " \
                                      f"{release2_deprecated_rs}, " \
                                      f"{release2_merged_deprecated_rs}, " \
                                      f"{release3_new_current_rs}, " \
                                      f"0, " \
                                      f"{release3_new_merged_rs}, " \
                                      f"0, " \
                                      f"0, " \
                                      f"{release3_new_ss_clustered}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_ss_clustered}," \
                                      f"{release3_clustered_current_rs}," \
                                      f"{release3_new_clustered_current_rs})"
            else:
                # if new assembly
                insert_query_values = f"{release3_new_current_rs}, " \
                                      f"0, " \
                                      f"{release3_new_merged_rs}, " \
                                      f"0, " \
                                      f"0, " \
                                      f"{release3_new_current_rs}, " \
                                      f"0, " \
                                      f"{release3_new_merged_rs}, " \
                                      f"0, " \
                                      f"0, " \
                                      f"{release3_new_ss_clustered}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_new_ss_clustered}," \
                                      f"{release3_new_clustered_current_rs}," \
                                      f"{release3_new_clustered_current_rs})"
            insert_query = f"{insert_query} {insert_query_values}"
            logger.info(insert_query)
            execute_query(metadata_connection_handle, insert_query)

        # get assemblies in from release 1 and 2 not in release 3
        assemblies_in_logs = ",".join(f"'{a}'" for a in ranges_per_assembly.keys())
        query_missing_assemblies_stats = f"select * " \
                                         f"from dbsnp_ensembl_species.release_rs_statistics_per_assembly " \
                                         f"where release_version = 2 " \
                                         f"and assembly_accession not in ({assemblies_in_logs});"
        logger.info(query_missing_assemblies_stats)
        missing_assemblies_stats = get_all_results_for_query(
            metadata_connection_handle, query_missing_assemblies_stats)
        for assembly_stats in missing_assemblies_stats:
            # Carry the release-2 row forward into release 3 with zeroed "new_*" columns
            taxonomy_id = assembly_stats[0]
            scientific_name = assembly_stats[1]
            assembly_accession = assembly_stats[2]
            release_folder = assembly_stats[3]
            current_rs = assembly_stats[5]
            multi_mapped_rs = assembly_stats[6]
            merged_rs = assembly_stats[7]
            deprecated_rs = assembly_stats[8]
            merged_deprecated_rs = assembly_stats[9]
            # get ss clustered
            query_ss_clustered = f"select sum(new_ss_clustered) " \
                                 f"from dbsnp_ensembl_species.release_rs_statistics_per_assembly " \
                                 f"where assembly_accession = '{assembly_accession}'"
            logger.info(query_ss_clustered)
            ss_clustered_previous_releases = get_all_results_for_query(
                metadata_connection_handle, query_ss_clustered)
            ss_clustered = ss_clustered_previous_releases[0][0]
            insert_query = f"insert into dbsnp_ensembl_species.release_rs_statistics_per_assembly "\
                           f"(taxonomy_id, scientific_name, assembly_accession, release_folder, release_version, " \
                           f"current_rs, multi_mapped_rs, merged_rs, deprecated_rs, merged_deprecated_rs, " \
                           f"new_current_rs, new_multi_mapped_rs, new_merged_rs, new_deprecated_rs, " \
                           f"new_merged_deprecated_rs, new_ss_clustered, remapped_current_rs, " \
                           f"new_remapped_current_rs, split_rs, new_split_rs, ss_clustered, clustered_current_rs, " \
                           f"new_clustered_current_rs) " \
                           f"values ({taxonomy_id}, '{scientific_name}', '{assembly_accession}', '{release_folder}', " \
                           f"3, {current_rs}, {multi_mapped_rs}, {merged_rs}, {deprecated_rs}, " \
                           f"{merged_deprecated_rs}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {ss_clustered}, 0, 0);"
            logger.info(insert_query)
            execute_query(metadata_connection_handle, insert_query)