def main(assembly_accession, species_name, output_directory, private_config_file, clear):
    """Download or construct the assembly for one accession/species pair.

    The eutils API key is read from ``private_config_file``; an
    ``NCBIAssembly`` is then created for the given accession, species name and
    output directory, and its ``download_or_construct`` is invoked with
    ``overwrite`` set to ``clear``.
    """
    api_key = get_args_from_private_config_file(private_config_file)['eutils_api_key']
    NCBIAssembly(
        assembly_accession,
        species_name,
        output_directory,
        eutils_api_key=api_key,
    ).download_or_construct(overwrite=clear)
def get_genome_fasta_and_report(species_name, assembly_accession, output_directory=None, overwrite=False):
    """Return the FASTA and assembly report paths, fetching them when needed.

    :param species_name: species name handed to ``NCBIAssembly``.
    :param assembly_accession: accession of the assembly.
    :param output_directory: target directory; when falsy, the value of
        ``cfg.query('genome_downloader', 'output_directory')`` is used.
    :param overwrite: when true, ``download_or_construct`` is always called.
    :return: tuple ``(assembly_fasta_path, assembly_report_path)``.
    """
    if not output_directory:
        output_directory = cfg.query('genome_downloader', 'output_directory')

    assembly = NCBIAssembly(
        assembly_accession,
        species_name,
        output_directory,
        eutils_api_key=cfg['eutils_api_key'],
    )

    # Only skip the download when both files are already on disk and the
    # caller did not ask for them to be regenerated.
    both_files_present = (
        os.path.isfile(assembly.assembly_fasta_path)
        and os.path.isfile(assembly.assembly_report_path)
    )
    if overwrite or not both_files_present:
        assembly.download_or_construct(overwrite=overwrite)

    return assembly.assembly_fasta_path, assembly.assembly_report_path
def download_assembly(scientific_name, assembly_accession, download_dir, assembly_report=None):
    """Download or construct an assembly and return its FASTA and report paths.

    The eutils API key is loaded from ``private-config.json`` located under
    ``eva_accession_path``. When ``assembly_report`` is given, that file is
    copied to the assembly's report path before ``download_or_construct`` runs.

    :return: tuple ``(assembly_fasta_path, assembly_report_path)``.
    """
    config_path = os.path.join(eva_accession_path, "private-config.json")
    with open(config_path) as config_handle:
        api_key = json.load(config_handle)['eutils_api_key']

    assembly = NCBIAssembly(assembly_accession, scientific_name, download_dir, api_key)

    if assembly_report:
        # Put the caller-supplied report in place before download_or_construct runs.
        shutil.copyfile(assembly_report, assembly.assembly_report_path)

    assembly.download_or_construct()
    return assembly.assembly_fasta_path, assembly.assembly_report_path
def fill_in_table_from_remapping(private_config_xml_file, release_version, reference_directory):
    """Populate ``clustering_release_tracker`` from ``remapping_tracker`` rows.

    For the given ``release_version`` the remapping tracker is aggregated per
    (taxonomy, scientific name, assembly accession). Each group whose summed
    ``num_ss_ids`` is not 0 yields one INSERT into
    ``eva_progress_tracker.clustering_release_tracker`` with both
    ``should_be_clustered`` and ``should_be_released`` set to ``True``.
    Conflicting rows are left untouched (``ON CONFLICT DO NOTHING``).
    """
    # NOTE(review): after concatenation there is no whitespace between
    # "sum(num_ss_ids)" and "from"; kept as-is here - confirm the database
    # accepts it or add a separating space.
    remapped_groups_query = (
        "select taxonomy, scientific_name, assembly_accession, string_agg(distinct source, ', '), sum(num_ss_ids)"
        "from eva_progress_tracker.remapping_tracker "
        f"where release_version={release_version} "
        "group by taxonomy, scientific_name, assembly_accession")

    # Every row created by this function carries the same two flags.
    should_be_clustered = True
    should_be_released = True

    with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn:
        grouped_rows = get_all_results_for_query(pg_conn, remapped_groups_query)
        for taxonomy, scientific_name, assembly_accession, sources, num_ss_id in grouped_rows:
            if num_ss_id == 0:
                # Do not release species with no data
                continue

            assembly = NCBIAssembly(assembly_accession, scientific_name, reference_directory)
            fasta = assembly.assembly_fasta_path
            report = assembly.assembly_report_path
            tempmongo = get_tempmongo_instance(pg_conn, taxonomy)
            folder_name = normalise_taxon_scientific_name(scientific_name)

            # NOTE(review): values are interpolated straight into the SQL text,
            # so a quote character inside any of them would break the statement.
            insert_statement = (
                'INSERT INTO eva_progress_tracker.clustering_release_tracker '
                '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
                'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) '
                f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly_accession}', {release_version}, "
                f"{should_be_clustered}, '{fasta}', '{report}', '{tempmongo}', {should_be_released}, "
                f"'{folder_name}') ON CONFLICT DO NOTHING")
            execute_query(pg_conn, insert_statement)
def insert_new_entry_for_taxonomy_assembly(pg_conn, sources, rs_count, release_version, taxonomy, assembly,
                                           reference_directory):
    """Insert one ``clustering_release_tracker`` row for a taxonomy/assembly.

    The scientific name is looked up from ``taxonomy``; FASTA and report paths
    come from an ``NCBIAssembly`` rooted at ``reference_directory``. The row is
    written with ``should_be_clustered`` False and ``should_be_released`` True,
    and an existing conflicting row is left alone (``ON CONFLICT DO NOTHING``).
    ``rs_count`` is only used in the log message.
    """
    logger.info(f'inserting rs count({rs_count}) for taxonomy({taxonomy}) and assembly({assembly})')

    species = get_scientific_name(pg_conn, taxonomy)
    folder_name = normalise_taxon_scientific_name(species)

    assembly_files = NCBIAssembly(assembly, species, reference_directory)
    fasta = assembly_files.assembly_fasta_path
    report = assembly_files.assembly_report_path
    tempmongo = get_tempmongo_instance(pg_conn, taxonomy)

    clustered_flag = False
    released_flag = True

    # NOTE(review): values are interpolated straight into the SQL text, so a
    # quote character inside any of them would break the statement.
    insert_statement = (
        'INSERT INTO eva_progress_tracker.clustering_release_tracker '
        '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
        'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) '
        f"VALUES ('{sources}', {taxonomy}, '{species}', '{assembly}', {release_version}, "
        f"{clustered_flag}, '{fasta}', '{report}', '{tempmongo}', {released_flag}, "
        f"'{folder_name}') ON CONFLICT DO NOTHING")
    execute_query(pg_conn, insert_statement)
def _does_contig_exist_in_assembly(contig_accession: str, assembly_accession: str) -> bool:
    """Check whether a contig accession appears in an assembly's report.

    The assembly report is downloaded into the current working directory and
    searched with ``grep -w`` for ``contig_accession``.

    :param contig_accession: accession to look for in the report.
    :param assembly_accession: assembly whose report is downloaded.
    :return: ``True`` when grep produced any output; ``False`` when it produced
        none or when any step inside the ``try`` raised (the error is logged).
    """
    logger.info(f"Obtaining assembly report for {assembly_accession}...")
    asm = NCBIAssembly(assembly_accession, species_scientific_name=None, reference_directory=None)
    try:
        assembly_report_file_name = os.path.basename(asm.assembly_report_url)
        # Remove any stale copy from a previous run without spawning a shell;
        # a missing file is fine (this mirrors the old "rm -f").
        try:
            os.remove(assembly_report_file_name)
        except FileNotFoundError:
            pass
        wget.download(asm.assembly_report_url)
        # "| cat" keeps the pipeline's exit status at 0 even when grep finds nothing.
        output = run_command_with_output(
            f"Checking if contig {contig_accession} exists in assembly {assembly_accession}",
            f'grep -w "{contig_accession}" "{assembly_report_file_name}" | cat',
            return_process_output=True)
        return output.strip() != ""
    except Exception as ex:
        logger.error(f"Could not download assembly report for {assembly_accession} due to: {ex}")
        return False
def collect_assembly_report_genbank_contigs(private_config_xml_file, assembly_accession):
    """Load the contigs listed in an assembly report into the metadata database.

    The assembly report is downloaded into the current working directory, its
    non-comment lines are parsed as tab-separated records and passed to
    ``insert_contigs_to_db`` in chunks of 100.

    Any exception is logged with its traceback and swallowed; nothing is
    returned.

    :param private_config_xml_file: config file used to build the metadata
        connection URI for the "development" profile.
    :param assembly_accession: assembly whose report is processed.
    """
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            asm = NCBIAssembly(assembly_accession, species_scientific_name=None, reference_directory=None)
            assembly_report_file_name = os.path.basename(asm.assembly_report_url)
            # Remove any stale copy from a previous run without spawning a shell;
            # a missing file is fine (this mirrors the old "rm -f").
            try:
                os.remove(assembly_report_file_name)
            except FileNotFoundError:
                pass
            wget.download(asm.assembly_report_url)

            insert_chunk_size = 100
            contig_info_list = []
            # The context manager guarantees the report file is closed, even
            # if parsing or inserting fails part-way through.
            with open(assembly_report_file_name, 'r') as assembly_report_file:
                for line in assembly_report_file:
                    stripped_line = line.strip()
                    # Skip comment lines, and blank lines which carry no contig
                    # and would otherwise fail the column lookup below.
                    if not stripped_line or stripped_line.startswith("#"):
                        continue
                    line_components = stripped_line.split("\t")
                    chromosome_name, genbank_accession, accession_equivalence, refseq_accession = \
                        line_components[0], line_components[4], line_components[5], line_components[6]
                    # Equivalence "Relationship" column in the assembly report indicates if
                    # Genbank and RefSeq contig accessions are equivalent
                    is_equivalent_genbank_available = (accession_equivalence.strip() == "=")
                    contig_info_list.append(
                        (assembly_accession, genbank_accession, chromosome_name,
                         is_equivalent_genbank_available, refseq_accession))
                    if len(contig_info_list) == insert_chunk_size:
                        insert_contigs_to_db(metadata_connection_handle, contig_info_list)
                        contig_info_list = []
            # Flush whatever is left over from the last, partially filled chunk.
            # NOTE(review): this is also called with an empty list when the
            # contig count is a multiple of the chunk size - confirm
            # insert_contigs_to_db tolerates that.
            insert_contigs_to_db(metadata_connection_handle, contig_info_list)
    except Exception:
        logger.error(traceback.format_exc())
def get_assembly_report_url(assembly_accession):
    """Return the assembly report URL for ``assembly_accession``.

    Only the URL is needed, so no species name or reference directory is
    supplied; both are passed explicitly as ``None``, as the other URL-only
    ``NCBIAssembly`` call sites in this file do.
    """
    asm = NCBIAssembly(assembly_accession, species_scientific_name=None, reference_directory=None)
    return asm.assembly_report_url