def insert_file_into_evapro(file_dict):
    """Insert a file record into the EVAPRO ``file`` table and mirror its new
    file_id into ``ena_submission_file_id``.

    :param file_dict: dict with at least 'filename' (path) and 'md5' keys.
    :returns: the file_id of the newly inserted row.
    :raises ValueError: if the filename extension is not .vcf/.vcf.gz/.tbi.
    """
    filename = os.path.basename(file_dict['filename'])
    if file_dict['filename'].endswith('.vcf.gz') or file_dict['filename'].endswith('.vcf'):
        file_type = 'vcf'
    elif file_dict['filename'].endswith('.tbi'):
        file_type = 'tabix'
    else:
        raise ValueError('Unsupported file type')
    ftp_file = 'ftp.sra.ebi.ac.uk/vol1/' + file_dict['filename']
    # Bug fix: 'filename' was computed but never used -- the insert and the log
    # message both used the literal placeholder '(unknown)' instead.
    query = (
        'insert into file '
        '(filename, file_md5, file_type, file_class, file_version, is_current, file_location, ftp_file) '
        f"values ('{filename}', '{file_dict['md5']}', '{file_type}', 'submitted', 1, 1, "
        f"'scratch_folder', '{ftp_file}')")
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        logger.info(f'Create file {filename} in the file table')
        execute_query(conn, query)
    file_id = get_file_id_from_md5(file_dict['md5'])
    # EVAPRO convention: ena_submission_file_id holds the file's own file_id.
    query = f'update file set ena_submission_file_id={file_id} where file_id={file_id}'
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        logger.info('Add file id in place of the ena_submission_file_id')
        execute_query(conn, query)
    return file_id
def fill_in_table_from_remapping(private_config_xml_file, release_version, reference_directory):
    """Populate eva_progress_tracker.clustering_release_tracker from aggregated
    remapping_tracker rows for one release version.

    :param private_config_xml_file: maven settings file holding DB credentials.
    :param release_version: release version to aggregate and insert.
    :param reference_directory: root directory used to resolve assembly FASTA/report paths.
    """
    # Bug fix: the original implicit string concatenation produced
    # "...sum(num_ss_ids)from eva_progress_tracker..." (no space before FROM),
    # which is invalid SQL.
    query_retrieve_info = (
        "select taxonomy, scientific_name, assembly_accession, string_agg(distinct source, ', '), sum(num_ss_ids) "
        "from eva_progress_tracker.remapping_tracker "
        f"where release_version={release_version} "
        "group by taxonomy, scientific_name, assembly_accession")
    with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn:
        for taxonomy, scientific_name, assembly_accession, sources, num_ss_id in get_all_results_for_query(
                pg_conn, query_retrieve_info):
            if num_ss_id == 0:
                # Do not release species with no data
                continue
            should_be_clustered = True
            should_be_released = True
            ncbi_assembly = NCBIAssembly(assembly_accession, scientific_name, reference_directory)
            fasta_path = ncbi_assembly.assembly_fasta_path
            report_path = ncbi_assembly.assembly_report_path
            tempmongo_instance = get_tempmongo_instance(pg_conn, taxonomy)
            release_folder_name = normalise_taxon_scientific_name(scientific_name)
            query_insert = (
                'INSERT INTO eva_progress_tracker.clustering_release_tracker '
                '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
                'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) '
                f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly_accession}', {release_version}, "
                f"{should_be_clustered}, '{fasta_path}', '{report_path}', '{tempmongo_instance}', {should_be_released}, "
                f"'{release_folder_name}') ON CONFLICT DO NOTHING")
            execute_query(pg_conn, query_insert)
def get_project_alias(project_accession):
    """Look up the alias of a project in the EVAPRO metadata database.

    :raises ValueError: when exactly one matching row is not found.
    """
    query = f"select alias from evapro.project where project_accession='{project_accession}';"
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        rows = get_all_results_for_query(conn, query)
    if len(rows) != 1:
        raise ValueError(f'No project alias for {project_accession} found in metadata DB.')
    return rows[0][0]
def cluster_multiple_from_mongo(taxonomy_id, common_clustering_properties_file, memory, instance, enable_retryable):
    """ Generates and runs a Nextflow pipeline to cluster all assemblies for a given taxonomy. """
    # Shared clustering settings (tracker table, release version, folders, nextflow paths)
    common_properties = get_common_clustering_properties(common_clustering_properties_file)
    clustering_tracking_table = common_properties["clustering-release-tracker"]
    release_version = common_properties["release-version"]
    clustering_folder = common_properties['clustering-folder']
    with get_metadata_connection_handle(
            "production", common_properties["private-config-xml-file"]) as metadata_connection_handle:
        # All assemblies to cluster for this taxonomy, plus the species name used in the pipeline
        assembly_list, scientific_name = get_assemblies_and_scientific_name_from_taxonomy(
            taxonomy_id, metadata_connection_handle, clustering_tracking_table, release_version)
        pipeline, output_directory = generate_linear_pipeline(
            taxonomy_id, scientific_name, assembly_list, common_properties, memory, instance, enable_retryable)
        # NOTE(review): 'timestamp' is not defined in this function -- presumably a
        # module-level value; confirm it is set before this is called.
        pipeline.run_pipeline(
            workflow_file_path=os.path.join(
                clustering_folder, f'{taxonomy_id}_clustering_workflow_{timestamp}.nf'),
            nextflow_binary_path=common_properties['nextflow-binary-path'],
            nextflow_config_path=common_properties['nextflow-config-path'],
            working_dir=output_directory)
def get_assemblies_from_evapro(profile, private_config_xml_file):
    """Return all distinct GCA assembly accessions known to EVAPRO or the remapping tracker."""
    query = ("select distinct assembly_accession from evapro.accessioned_assembly where assembly_accession like 'GCA%'"
             " union "
             "select distinct assembly_accession from eva_progress_tracker.remapping_tracker where assembly_accession like 'GCA%'")
    with get_metadata_connection_handle(profile, private_config_xml_file) as pg_conn:
        rows = get_all_results_for_query(pg_conn, query)
    return [row[0] for row in rows]
def fill_num_rs_id_for_taxonomy_and_assembly(mongo_source, private_config_xml_file, release_version, taxonomy,
                                             reference_directory):
    """Record per-assembly RS counts for a taxonomy in the metadata DB.

    Counts RS ids in both the EVA and dbSNP clustered-variant Mongo collections,
    then updates an existing tracker row or inserts a new one per assembly.
    """
    assembly_list = get_assembly_list_for_taxonomy(private_config_xml_file, taxonomy, release_version)
    pipeline = get_filter_group_count_pipeline(taxonomy, assembly_list)
    # RS counts from the EVA-accessioned and the dbSNP-imported collections
    cve_res = get_rs_count_per_assembly_in_collection(mongo_source, 'clusteredVariantEntity', pipeline)
    dbsnp_res = get_rs_count_per_assembly_in_collection(mongo_source, 'dbsnpClusteredVariantEntity', pipeline)
    with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn:
        for assembly in assembly_list:
            sources, rs_count = get_sources_and_rs_count(cve_res, dbsnp_res, assembly)
            # Skip if there are no rs present for the assembly in any collection
            if sources == 'None':
                continue
            entry_exists = check_if_entry_exists_for_taxonomy_and_assembly(pg_conn, taxonomy, assembly)
            if entry_exists:
                update_rs_count_for_taxonomy_assembly(pg_conn, rs_count, taxonomy, assembly)
            else:
                insert_new_entry_for_taxonomy_assembly(pg_conn, sources, rs_count, release_version, taxonomy,
                                                       assembly, reference_directory)
def get_job_information(self, assembly, taxid):
    """Aggregate remapping-tracker rows for one origin assembly and taxonomy.

    :returns: (sources, scientific_name, target_assembly, progress_status, n_study, n_variants)
    NOTE(review): scientific_name and target_assembly retain the values from the
    LAST row the query returns -- presumably identical across all rows for one
    (assembly, taxid); confirm against the table contents.
    """
    query = (
        'SELECT source, scientific_name, assembly_accession, remapping_status, SUM(num_studies), '
        'SUM(num_ss_ids) '
        'FROM eva_progress_tracker.remapping_tracker '
        f"WHERE origin_assembly_accession='{assembly}' AND taxonomy='{taxid}' "
        'GROUP BY source, origin_assembly_accession, scientific_name, assembly_accession, remapping_status'
    )
    source_set = set()
    progress_set = set()
    scientific_name = None
    target_assembly = None
    n_study = 0
    n_variants = 0
    with get_metadata_connection_handle(
            cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
        for source, scientific_name, target_assembly, progress_status, n_st, n_var in get_all_results_for_query(
                pg_conn, query):
            source_set.add(source)
            # Only collect non-empty statuses; all-empty falls through to 'Pending' below
            if progress_status:
                progress_set.add(progress_status)
            n_study += n_st
            n_variants += n_var
    sources = ', '.join(source_set)
    if progress_set:
        progress_status = ', '.join(progress_set)
    else:
        progress_status = 'Pending'
    return sources, scientific_name, target_assembly, progress_status, n_study, n_variants
def get_all_taxonomies_from_eva(private_config_xml_file):
    """Return the distinct taxonomy ids present in the EVAPRO taxonomy table."""
    query = 'SELECT DISTINCT taxonomy_id FROM evapro.taxonomy'
    with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn:
        return [row[0] for row in get_all_results_for_query(pg_conn, query)]
def get_most_recent_timestamp(private_config_xml_file, ftp_table_name):
    """Return the most recent event_ts_txt in the given FTP stats table, or None when empty."""
    query = f"select max(event_ts_txt) as recent_ts from {ftp_table_name};"
    with get_metadata_connection_handle(
            'development', private_config_xml_file) as metadata_connection_handle:
        results = get_all_results_for_query(metadata_connection_handle, query)
    if results and results[0][0]:
        return results[0][0]
    return None
def set_status_failed(self, assembly, taxid):
    """Mark the remapping of the given assembly/taxonomy as 'Failed' in the progress tracker."""
    update_query = (
        'UPDATE eva_progress_tracker.remapping_tracker '
        f"SET remapping_status = 'Failed' "
        f"WHERE origin_assembly_accession='{assembly}' AND taxonomy='{taxid}'"
    )
    with get_metadata_connection_handle(
            cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
        execute_query(pg_conn, update_query)
def get_tax_asm_source_from_eva(private_config_xml_file):
    """Map each taxonomy id to its current target assembly and source from the remapping target table."""
    query = f"""SELECT DISTINCT taxonomy_id, source, assembly_id FROM {remapping_genome_target_table} WHERE current=TRUE"""
    with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn:
        # Dict comprehension keeps the last row per taxonomy, same as the original loop
        return {tax_id: {'assembly': assembly, 'source': source}
                for tax_id, source, assembly in get_all_results_for_query(pg_conn, query)}
def process_one_assembly(assembly, resume):
    """Process every DBSNP taxonomy/target-assembly pair recorded for this origin assembly."""
    # Check the original remapping tracking table for the appropriate taxonomies and target assemblies
    query = (
        'SELECT taxonomy, scientific_name, assembly_accession '
        'FROM eva_progress_tracker.remapping_tracker '
        f"WHERE origin_assembly_accession='{assembly}' "
        "AND source='DBSNP'"
    )
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
        rows = get_all_results_for_query(pg_conn, query)
        for taxid, scientific_name, target_assembly in rows:
            process_one_taxonomy(assembly, taxid, scientific_name, target_assembly, resume)
def get_file_id_from_md5(md5):
    """Return the EVAPRO file_id whose md5 matches, or None when absent.

    :raises ValueError: if more than one file shares the md5.
    """
    query = f"select file_id from file where file_md5='{md5}'"
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        rows = get_all_results_for_query(conn, query)
    file_ids = [file_id for file_id, in rows]
    if len(file_ids) > 1:
        # Fixed error-message grammar ("Multiple file found" -> "Multiple files found")
        raise ValueError(f'Multiple files found with md5 {md5}')
    if not file_ids:
        return None
    return file_ids[0]
def insert_file_analysis_into_evapro(file_dict):
    """Link a file to an analysis by inserting into the EVAPRO analysis_file join table.

    :param file_dict: dict with 'file_id' (numeric) and 'analysis_accession' keys.
    """
    # Bug fix: the original VALUES clause listed file_id first and
    # analysis_accession second -- swapped relative to the column list
    # (ANALYSIS_ACCESSION, FILE_ID), so the values went to the wrong columns.
    query = (
        f"insert into analysis_file (ANALYSIS_ACCESSION,FILE_ID) "
        f"values ('{file_dict['analysis_accession']}', {file_dict['file_id']})"
    )
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        logger.info(
            f"Create file {file_dict['file_id']} in the analysis_file table for '{file_dict['analysis_accession']}'"
        )
        execute_query(conn, query)
def remove_file_from_analysis(file_dict):
    """Unlink a file from an analysis by deleting its analysis_file row."""
    delete_query = (
        f"delete from analysis_file "
        f"where file_id={file_dict['file_id']} and analysis_accession='{file_dict['analysis_accession']}'"
    )
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        logger.info(
            f"Remove file {file_dict['file_id']} from the analysis_file table for '{file_dict['analysis_accession']}'"
        )
        execute_query(conn, delete_query)
def get_last_release_metric(private_config_xml_file, release_version, taxonomy_id, column_name):
    """Fetch a metric column for this taxonomy from the previous release's stats table."""
    query = (f"select {column_name} from {species_table_name} "
             f"where release_version={release_version-1} "
             f"and taxonomy_id={taxonomy_id}")
    with get_metadata_connection_handle('development', private_config_xml_file) as db_conn:
        results = get_all_results_for_query(db_conn, query)
    # If this is a new species for this release, won't find anything, so just return 0
    return results[0][0] if results else 0
def set_clustering_status(private_config_xml_file, clustering_tracking_table, assembly, tax_id, release_version,
                          status):
    """Update the clustering status (plus start/end timestamp) for one assembly/taxonomy/release row."""
    now = datetime.datetime.now().isoformat()
    set_clauses = [f"clustering_status='{status}'"]
    # Stamp the matching timestamp column for the lifecycle transition
    if status == 'Started':
        set_clauses.append(f"clustering_start='{now}'")
    elif status == 'Completed':
        set_clauses.append(f"clustering_end='{now}'")
    update_status_query = (f"UPDATE {clustering_tracking_table} SET " + ', '.join(set_clauses) +
                           f" WHERE assembly_accession='{assembly}' AND taxonomy='{tax_id}' "
                           f"AND release_version={release_version}")
    with get_metadata_connection_handle("development", private_config_xml_file) as metadata_connection_handle:
        execute_query(metadata_connection_handle, update_status_query)
def write_counts_to_table(private_config_xml_file, counts):
    """Bulk-insert per-species count rows into the species stats table.

    :param counts: non-empty list of dicts; the keys of counts[0] become the
        column list, and every dict is assumed to share those keys.
    NOTE(review): values are interpolated with str() and never quoted -- this
    assumes every value is numeric or already SQL-formatted; confirm callers.
    """
    all_columns = counts[0].keys()
    # One "(v1,v2,...)" tuple literal per species row
    all_values = [
        f"({','.join(str(species_counts[c]) for c in all_columns)})"
        for species_counts in counts
    ]
    insert_query = f"insert into {species_table_name} " \
                   f"({','.join(all_columns)}) " \
                   f"values {','.join(all_values)}"
    with get_metadata_connection_handle('development', private_config_xml_file) as db_conn:
        execute_query(db_conn, insert_query)
def get_assembly_list_for_taxonomy(private_config_xml_file, taxonomy, release_version):
    """Collect the union of assemblies for a taxonomy from EVAPRO and the clustering release tracker."""
    evapro_query = f'SELECT assembly_accession from evapro.assembly where taxonomy_id = {taxonomy}'
    tracker_query = (f"SELECT distinct assembly_accession "
                     f"from eva_progress_tracker.clustering_release_tracker "
                     f"where taxonomy={taxonomy} and release_version={release_version} and assembly_accession!='Unmapped'")
    assemblies = set()
    with get_metadata_connection_handle("production", private_config_xml_file) as pg_conn:
        for query in (evapro_query, tracker_query):
            assemblies.update(row[0] for row in get_all_results_for_query(pg_conn, query))
    return list(assemblies)
def create_stats_table(private_config_xml_file, ftp_table_name):
    """Create (if absent) the FTP usage-stats table.

    The table keys on (_index, _id) and mixes raw log fields (host, file_name,
    transfer details) with geo/ISP enrichment columns.
    """
    with get_metadata_connection_handle(
            'development', private_config_xml_file) as metadata_connection_handle:
        query_create_table = (
            f'CREATE TABLE IF NOT EXISTS {ftp_table_name} '
            '(_index TEXT, _id TEXT, event_ts_txt TEXT, event_ts TIMESTAMP, host TEXT, uhost TEXT,'
            ' request_time TEXT, request_year INTEGER, request_ts TIMESTAMP,'
            ' file_name TEXT, file_size BIGINT, transfer_time INTEGER,'
            ' transfer_type CHAR, direction CHAR, special_action CHAR(4), access_mode CHAR,'
            ' country CHAR(2), region TEXT, city TEXT, domain_name TEXT, isp TEXT, usage_type TEXT,'
            ' primary key(_index, _id))')
        execute_query(metadata_connection_handle, query_create_table)
def set_counts(self, assembly, taxid, source, nb_variant_extracted=None, nb_variant_remapped=None,
               nb_variant_ingested=None):
    """Update extracted/remapped/ingested SS counts on an existing remapping-tracker row.

    Only counts passed as non-None are written. Nothing happens when the
    (assembly, taxid, source) row does not exist or no count was provided.
    """
    set_statements = []
    query = (
        f"SELECT * FROM eva_progress_tracker.remapping_tracker "
        f"WHERE origin_assembly_accession='{assembly}' AND taxonomy='{taxid}' AND source='{source}'"
    )
    with get_metadata_connection_handle(
            cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
        # Check that this row exists
        results = get_all_results_for_query(pg_conn, query)
        if results:
            if nb_variant_extracted is not None:
                set_statements.append(f"num_ss_extracted = {nb_variant_extracted}")
            if nb_variant_remapped is not None:
                set_statements.append(f"num_ss_remapped = {nb_variant_remapped}")
            if nb_variant_ingested is not None:
                set_statements.append(f"num_ss_ingested = {nb_variant_ingested}")
    if set_statements:
        query = (
            'UPDATE eva_progress_tracker.remapping_tracker '
            'SET ' + ', '.join(set_statements) + ' '
            f"WHERE origin_assembly_accession='{assembly}' AND taxonomy='{taxid}' AND source='{source}'"
        )
        # A second, separate connection performs the update
        with get_metadata_connection_handle(
                cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
            execute_query(pg_conn, query)
def get_accession_counts_per_assembly(private_config_xml_file, source):
    """Return {assembly_accession: (assembly_accession, taxid, total submitted variants)}
    for one source from the eva_stats load-counts table.

    :param source: value matched against submitted_variants_load_counts.source.
    """
    accession_count = {}
    # Bug fix: the original implicit concatenation produced
    # "...source='...'GROUP BY..." with no space before GROUP BY (invalid SQL).
    query = (
        'SELECT assembly_accession, taxid, SUM(number_submitted_variants) '
        'FROM eva_stats.submitted_variants_load_counts '
        "WHERE source='%s' "
        'GROUP BY assembly_accession, taxid ' % source)
    with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn:
        for assembly_accession, taxid, count_ssid in get_all_results_for_query(pg_conn, query):
            accession_count[assembly_accession] = (assembly_accession, taxid, count_ssid)
    return accession_count
def get_taxonomy_and_scientific_name(private_config_xml_file, release_version, species_folder):
    """Look up (taxonomy, scientific_name) for a release folder; (None, None) when not found."""
    query = (f"select taxonomy, scientific_name from {tracker_table_name} "
             f"where release_version={release_version} "
             f"and release_folder_name='{species_folder}' "
             f"and should_be_released")
    with get_metadata_connection_handle('development', private_config_xml_file) as db_conn:
        results = get_all_results_for_query(db_conn, query)
    if not results:
        logger.warning(f'Failed to get scientific name and taxonomy for {species_folder}')
        return None, None
    return results[0][0], results[0][1]
def create_table_for_progress(private_config_xml_file):
    """Create (if absent) the remapping progress-tracker table.

    The table keys on (source, taxonomy, origin_assembly_accession, release_version)
    and tracks per-remapping status, timestamps and SS-id counts.
    """
    with get_metadata_connection_handle(
            "development", private_config_xml_file) as metadata_connection_handle:
        query_create_table = (
            'CREATE TABLE IF NOT EXISTS eva_progress_tracker.remapping_tracker '
            '(source TEXT, taxonomy INTEGER, scientific_name TEXT, origin_assembly_accession TEXT, num_studies INTEGER NOT NULL,'
            'num_ss_ids BIGINT NOT NULL, release_version INTEGER, assembly_accession TEXT, '
            'remapping_report_time TIMESTAMP DEFAULT NOW(), remapping_status TEXT, remapping_start TIMESTAMP, '
            'remapping_end TIMESTAMP, remapping_version TEXT, num_ss_extracted INTEGER, '
            'num_ss_remapped INTEGER, num_ss_ingested INTEGER, '
            'primary key(source, taxonomy, origin_assembly_accession, release_version))'
        )
        execute_query(metadata_connection_handle, query_create_table)
def insert_remapping_progress_to_db(private_config_xml_file, dataframe):
    """Bulk-insert the rows of a dataframe into the remapping tracker table."""
    rows_to_insert = dataframe.values.tolist()
    if not rows_to_insert:
        return
    query_insert = ('INSERT INTO eva_progress_tracker.remapping_tracker '
                    '(source, taxonomy, scientific_name, origin_assembly_accession, num_studies, '
                    'num_ss_ids, assembly_accession, release_version) '
                    'VALUES %s')
    with get_metadata_connection_handle(
            "development", private_config_xml_file) as metadata_connection_handle:
        with metadata_connection_handle.cursor() as cursor:
            # execute_values expands VALUES %s into one multi-row insert
            psycopg2.extras.execute_values(cursor, query_insert, rows_to_insert)
def fill_in_from_previous_inventory(private_config_xml_file, release_version):
    """Carry over 'DBSNP - filesystem' species from the release-2 inventory into the clustering release tracker."""
    query = ("select taxonomy_id, scientific_name, assembly, sources, total_num_variants, release_folder_name "
             "from dbsnp_ensembl_species.release_species_inventory where sources='DBSNP - filesystem' and release_version=2")
    with get_metadata_connection_handle("production", private_config_xml_file) as pg_conn:
        inventory_rows = get_all_results_for_query(pg_conn, query)
        for taxonomy, scientific_name, assembly, sources, total_num_variants, release_folder_name in inventory_rows:
            # Legacy filesystem species are recorded but neither clustered nor released
            should_be_clustered = False
            should_be_released = False
            insert_statement = (
                'INSERT INTO eva_progress_tracker.clustering_release_tracker '
                '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
                'should_be_released, release_folder_name) '
                f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly}', {release_version}, "
                f"{should_be_clustered}, {should_be_released}, '{release_folder_name}') ON CONFLICT DO NOTHING"
            )
            execute_query(pg_conn, insert_statement)
def required_contigs(self):
    """Return list of dict retrieve from the eva_tasks.eva2469_contig_analysis table."""
    self.info('Retrieve required contigs from database')
    query = (
        "select distinct contig_accession,refseq_contig_from_equiv_table from eva_tasks.eva2469_contig_analysis "
        "where source_table in ('dbsnpSubmittedVariantEntity', 'submittedVariantEntity') "
        f"and assembly_accession='{self.assembly_accession}'")
    contigs = []
    with get_metadata_connection_handle(
            cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
        for genbank, refseq in get_all_results_for_query(pg_conn, query):
            # NULL accessions map to empty strings; whitespace is trimmed
            contigs.append({'genbank': genbank.strip() if genbank else '',
                            'refseq': refseq.strip() if refseq else ''})
    return contigs
def get_new_ss_clustered(private_config_xml_file, release_version, taxonomy_id):
    """Return the new_ss_clustered count of the single qualifying assembly for a taxonomy, or 0.

    :raises ValueError: when more than one assembly has new clustered ss.
    """
    query = (f"select new_ss_clustered from {assembly_table_name} "
             f"where release_version={release_version} "
             f"and taxonomy_id={taxonomy_id} "
             f"and new_ss_clustered > 0")
    with get_metadata_connection_handle('development', private_config_xml_file) as db_conn:
        results = get_all_results_for_query(db_conn, query)
    if len(results) > 1:
        raise ValueError(
            f'Should have exactly one assembly for taxonomy {taxonomy_id} with new clustered ss, '
            f'instead found {len(results)}')
    if not results:
        logger.warning(
            f'No assemblies found with new clustered ss for taxonomy {taxonomy_id}'
        )
        return 0
    return results[0][0]
def list_assemblies_to_process(self):
    """Pretty-print a table of every assembly/taxonomy in the remapping tracker
    with its aggregated job information."""
    query = 'SELECT DISTINCT origin_assembly_accession, taxonomy FROM eva_progress_tracker.remapping_tracker'
    # Fixed header typo: 'Taxonom ID' -> 'Taxonomy ID'
    header = [
        'Sources', 'Scientific_name', 'Assembly', 'Taxonomy ID', 'Target Assembly', 'Progress Status',
        'Numb Of Study', 'Numb Of Variants'
    ]
    rows = []
    with get_metadata_connection_handle(
            cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
        for assembly, taxid in get_all_results_for_query(pg_conn, query):
            sources, scientific_name, target_assembly, progress_status, n_study, n_variants = \
                self.get_job_information(assembly, taxid)
            rows.append([
                sources, scientific_name, assembly, taxid, target_assembly, progress_status,
                n_study, n_variants
            ])
    pretty_print(header, rows)
def difference_evapro_file_set_with_ena_for_analysis(analysis_accession, ena_list_of_file_dicts):
    """Compare (by md5) the files attached to an analysis in EVAPRO with those reported by ENA.

    :param analysis_accession: analysis whose file set is compared.
    :param ena_list_of_file_dicts: list of {'filename': ..., 'md5': ...} dicts from ENA.
    :returns: (files only in ENA, files only in EVAPRO) as lists of dicts;
              two empty lists when the md5 sets match.
    """
    query = (f"select f.file_name, f.file_md5 "
             f"from analysis_file af join file f on af.file_id=f.file_id "
             f"where af.analysis_accession='{analysis_accession}';")
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        eva_list_of_file_dicts = [{'filename': fn, 'md5': md5}
                                  for fn, md5 in get_all_results_for_query(conn, query)]
    set_of_file_from_ena = set(d['md5'] for d in ena_list_of_file_dicts)
    set_of_file_from_eva_pro = set(d['md5'] for d in eva_list_of_file_dicts)
    if set_of_file_from_ena != set_of_file_from_eva_pro:
        # Fix: logger.warn is a deprecated alias -- use logger.warning
        logger.warning(f'File for analysis {analysis_accession} are different in ENA and EVA')
        file_specific_to_ena = set_of_file_from_ena.difference(set_of_file_from_eva_pro)
        file_specific_to_eva = set_of_file_from_eva_pro.difference(set_of_file_from_ena)
        file_dict_specific_to_ena = [
            file_dict for file_dict in ena_list_of_file_dicts if file_dict['md5'] in file_specific_to_ena
        ]
        file_dict_specific_to_eva = [
            file_dict for file_dict in eva_list_of_file_dicts if file_dict['md5'] in file_specific_to_eva
        ]
        for file_dict in file_dict_specific_to_ena:
            logger.warning(f"File {file_dict['filename']} exist in ENA but not in EVA.")
        for file_dict in file_dict_specific_to_eva:
            logger.warning(f"File {file_dict['filename']} exist in EVA but not in ENA.")
        return file_dict_specific_to_ena, file_dict_specific_to_eva
    return [], []