def construct_asm_lookup_table(metadata_db_name, metadata_db_user,
                               metadata_db_host):
    """Populate the snpmapinfo assembly lookup table in the metadata database.

    The lookup table is created if needed. Then, for every species and each
    of its snpmapinfo tables, every distinct assembly is resolved through
    ``resolve_asm_to_GCA_accession`` and stored as one row. Rows that would
    violate the table's unique constraint are skipped, and each insert is
    committed individually.

    :param metadata_db_name: name of the metadata database to connect to
    :param metadata_db_user: user to connect as
    :param metadata_db_host: host of the metadata database
    """
    lookup_table = "dbsnp_ensembl_species.EVA2015_snpmapinfo_asm_lookup"
    insert_template = (
        "insert into {table} values ('{db}', '{snpmapinfo}', '{asm}', '{gca}') "
        "on conflict on constraint unique_constraint do nothing")
    with get_pg_connection_handle(metadata_db_name, metadata_db_user,
                                  metadata_db_host) as pg_handle:
        create_asm_lookup_table(pg_handle, lookup_table)
        for species_info in get_species_info(pg_handle):
            snpmapinfo_tables = get_snpmapinfo_table_names_for_species(
                species_info)
            for snpmapinfo_table in snpmapinfo_tables:
                assemblies, asm_type = \
                    get_distinct_asm_with_overweight_snps_in_snpmapinfo_table(
                        snpmapinfo_table, species_info)
                for asm in assemblies:
                    gca_accession = resolve_asm_to_GCA_accession(asm, asm_type)
                    query = insert_template.format(
                        table=lookup_table,
                        db=species_info["database_name"],
                        snpmapinfo=snpmapinfo_table,
                        asm=asm,
                        gca=gca_accession)
                    logger.info("Executing query: " + query)
                    execute_query(pg_handle, query)
                    # Commit row by row so completed work survives a later failure.
                    pg_handle.commit()
def fill_in_table_from_remapping(private_config_xml_file, release_version,
                                 reference_directory):
    """Seed the clustering release tracker from the remapping tracker.

    Aggregates the remapping tracker rows of ``release_version`` per
    (taxonomy, scientific name, assembly) and inserts one tracker row for
    each aggregate that has at least one submitted variant. Inserted rows are
    flagged as both to be clustered and to be released; existing rows are
    left alone (``ON CONFLICT DO NOTHING``).

    :param private_config_xml_file: settings file used to open the
        "development" metadata connection
    :param release_version: release number to read from and write to
    :param reference_directory: directory handed to ``NCBIAssembly`` to
        derive the fasta and assembly report paths
    """
    select_remapped = (
        "select taxonomy, scientific_name, assembly_accession, string_agg(distinct source, ', '), sum(num_ss_ids)"
        "from eva_progress_tracker.remapping_tracker "
        f"where release_version={release_version} "
        "group by taxonomy, scientific_name, assembly_accession")
    clustered_flag = True
    released_flag = True
    with get_metadata_connection_handle("development",
                                        private_config_xml_file) as pg_conn:
        remapped_rows = get_all_results_for_query(pg_conn, select_remapped)
        for taxonomy, scientific_name, accession, sources, ss_total in remapped_rows:
            if ss_total == 0:
                # Do not release species with no data
                continue

            assembly = NCBIAssembly(accession, scientific_name,
                                    reference_directory)
            fasta_path = assembly.assembly_fasta_path
            report_path = assembly.assembly_report_path
            tempmongo_instance = get_tempmongo_instance(pg_conn, taxonomy)
            folder_name = normalise_taxon_scientific_name(scientific_name)
            insert_sql = (
                'INSERT INTO eva_progress_tracker.clustering_release_tracker '
                '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
                'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) '
                f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{accession}', {release_version}, "
                f"{clustered_flag}, '{fasta_path}', '{report_path}', '{tempmongo_instance}', {released_flag}, "
                f"'{folder_name}') ON CONFLICT DO NOTHING")
            execute_query(pg_conn, insert_sql)
예제 #3
0
def get_dates_from_variant_warehouse(assembly, project, analysis_filename,
                                     metadata_handle, mongo_handle,
                                     database_name):
    """Copy file processing dates from the variant warehouse into eva_stats.

    Looks up, in the ``files_2_0`` collection of ``database_name``, the
    documents matching ``project`` and each (analysis, filename) pair, then
    writes each document's ``date`` into ``eva_stats.stats.date_processed``
    for the corresponding assembly/project/analysis.

    :param assembly: assembly accession used to select the stats row
    :param project: project accession (``sid`` in the files collection)
    :param analysis_filename: mapping of analysis accession (``fid``) to
        file name (``fname``)
    :param metadata_handle: connection passed to ``execute_query``
    :param mongo_handle: Mongo client indexed by database name
    :param database_name: variant warehouse database to query
    """
    logger.info("Database name to query: {0}".format(database_name))
    logger.info(
        "Getting counts from files collection for {0}, {1}, {2}".format(
            assembly, project, analysis_filename))
    files_collection = mongo_handle[database_name]['files_2_0']

    where_clause = [
        {'sid': project, 'fid': analysis, 'fname': filename}
        for analysis, filename in analysis_filename.items()
    ]
    if not where_clause:
        # MongoDB rejects a "$or" with an empty array, and there is nothing
        # to update anyway.
        logger.info("No analysis files provided for {0}, {1}: nothing to update".format(
            assembly, project))
        return

    cursor_files = files_collection.find({"$or": where_clause}, {
        "fid": 1,
        "sid": 1,
        "date": 1,
        "_id": 0
    })

    for file_doc in cursor_files:
        analysis = file_doc['fid']
        date = file_doc['date']
        logger.info("Update counts for {0}, {1}, {2}".format(
            assembly, project, analysis))
        # Each fragment ends with a space so that the clauses are never
        # glued to the preceding literal.
        update_date_query = (
            "update eva_stats.stats "
            "set date_processed = '{3}' "
            "where assembly_accession = '{0}' "
            "and project_accession = '{1}' "
            "and analysis_accession = '{2}'"
        ).format(assembly, project, analysis, date)
        execute_query(metadata_handle, update_date_query)
예제 #4
0
 def update_assembly_set_in_analysis(self):
     """Align the assembly_set_id stored for each submitted analysis.

     For every analysis in the submission config, the assembly set returned
     by ``get_assembly_set`` for the submission taxonomy and the analysis's
     assembly is compared with the value stored in ``evapro.analysis``.
     When a row exists and differs, it is updated; if the stored value was
     non-empty the change is also reported through ``self.error``.
     """
     taxonomy = self.eload_cfg.query('submission', 'taxonomy_id')
     analyses = self.eload_cfg.query('submission', 'analyses')
     with self.metadata_connection_handle as conn:
         for alias, analysis in analyses.items():
             expected_set_id = get_assembly_set(
                 conn, taxonomy, analysis['assembly_accession'])
             analysis_accession = self.eload_cfg.query(
                 'brokering', 'ena', 'ANALYSIS', alias)
             # Check if the update is needed
             lookup_sql = (
                 f"select assembly_set_id from evapro.analysis "
                 f"where analysis_accession = '{analysis_accession}';")
             found = get_all_results_for_query(conn, lookup_sql)
             if not (found and found[0][0] != expected_set_id):
                 # No such analysis row, or it already has the right value.
                 continue
             stored_set_id = found[0][0]
             if stored_set_id:
                 self.error(
                     f'Previous assembly_set_id {stored_set_id} for {analysis_accession} was wrong and '
                     f'will be updated to {expected_set_id}')
             update_sql = (
                 f"update evapro.analysis "
                 f"set assembly_set_id = '{expected_set_id}' "
                 f"where analysis_accession = '{analysis_accession}';")
             execute_query(conn, update_sql)
def create_multimap_snp_table_and_indices(metadata_connection_handle,
                                          dbsnp_species_name, species_info):
    """Create the ``multimap_snps`` table of a dbSNP species schema.

    Builds a ``union all`` over every snpmapinfo table reported as having
    overweight SNPs, keeping rows with ``weight > 1`` and exposing the
    assembly columns joined with ``'.'`` as ``assembly``. If there is at
    least one such table, the distinct union is materialised as
    ``dbsnp_<species>.multimap_snps`` and, for each of the ``snp_id``,
    ``weight`` and ``assembly`` columns, an index is created, the table is
    vacuum-analyzed and select is granted to ``dbsnp_ro``.

    :param metadata_connection_handle: connection used to list the
        snpmapinfo tables with overweight SNPs
    :param dbsnp_species_name: species name used to form the schema name
    :param species_info: species record used to open the species database
        and to look up the assembly columns
    """
    per_table_selects = []
    for table_name in get_snpmapinfo_tables_with_overweight_snps_for_dbsnp_species(
            metadata_connection_handle, dbsnp_species_name):
        assembly_expression = "||'.'||".join(
            get_snpmapinfo_asm_columns(species_info, table_name))
        per_table_selects.append(
            "select snp_id, weight, {0} as assembly from dbsnp_{1}.{2} where weight > 1"
            .format(assembly_expression, dbsnp_species_name, table_name))
    union_query = " union all ".join(per_table_selects)
    if not union_query:
        # No snpmapinfo table with overweight SNPs: nothing to create.
        return

    multimap_table = "multimap_snps"
    schema_name = "dbsnp_" + dbsnp_species_name
    table_creation_query = """            
                create table if not exists dbsnp_{0}.{1} as (select distinct * from ({2}) temp);
                """.format(dbsnp_species_name, multimap_table, union_query)
    grant_query = "grant select on dbsnp_{0}.{1} to dbsnp_ro".format(
        dbsnp_species_name, multimap_table)
    with get_db_conn_for_species(species_info) as species_connection_handle:
        logger.info("Executing query: " + table_creation_query)
        execute_query(species_connection_handle, table_creation_query)
        for column in ("snp_id", "weight", "assembly"):
            indexed_columns = [column]
            create_index_on_table(species_connection_handle, schema_name,
                                  multimap_table, indexed_columns)
            vacuum_analyze_table(species_connection_handle, schema_name,
                                 multimap_table, indexed_columns)
            # The grant is issued after each column has been processed.
            execute_query(species_connection_handle, grant_query)
예제 #6
0
def insert_file_into_evapro(file_dict):
    """Register a submitted file in the ``file`` table and return its id.

    The file type is derived from the file name extension (``vcf`` for
    ``.vcf``/``.vcf.gz``, ``tabix`` for ``.tbi``). After the insert, the id
    obtained from ``get_file_id_from_md5`` is copied into the row's
    ``ena_submission_file_id`` column.

    :param file_dict: mapping with at least ``filename`` and ``md5`` keys
    :return: the file id returned by ``get_file_id_from_md5``
    :raises ValueError: if the file name has an unsupported extension
    """
    file_path = file_dict['filename']
    filename = os.path.basename(file_path)
    if file_path.endswith(('.vcf.gz', '.vcf')):
        file_type = 'vcf'
    elif file_path.endswith('.tbi'):
        file_type = 'tabix'
    else:
        raise ValueError('Unsupported file type')
    ftp_file = 'ftp.sra.ebi.ac.uk/vol1/' + file_path
    md5 = file_dict['md5']

    insert_sql = (
        'insert into file '
        '(filename, file_md5, file_type, file_class, file_version, is_current, file_location, ftp_file) '
        f"values ('{filename}', '{md5}', '{file_type}', 'submitted', 1, 1, "
        f"'scratch_folder', '{ftp_file}')")
    with get_metadata_connection_handle(cfg['maven']['environment'],
                                        cfg['maven']['settings_file']) as conn:
        logger.info(f'Create file {filename} in the file table')
        execute_query(conn, insert_sql)

    # The id only exists once the row has been inserted, hence the second statement.
    file_id = get_file_id_from_md5(file_dict['md5'])
    update_sql = f'update file set ena_submission_file_id={file_id} where file_id={file_id}'
    with get_metadata_connection_handle(cfg['maven']['environment'],
                                        cfg['maven']['settings_file']) as conn:
        logger.info('Add file id in place of the ena_submission_file_id')
        execute_query(conn, update_sql)
    return file_id
예제 #7
0
 def insert_browsable_files(self):
     """Insert the project's VCF files into ``browsable_file`` if absent.

     Reads the rows currently in ``evapro.browsable_file`` for this project
     and the rows expected from joining analysis, analysis_file,
     project_analysis and file (VCF files only). If the table already has
     rows for the project they are only compared with the expected ones
     (a warning is logged on mismatch); otherwise the expected rows are
     inserted.
     """
     with self.metadata_connection_handle as conn:
         # insert into browsable file table, if files not already there
         files_query = (
             f"select file_id, ena_submission_file_id,filename,project_accession,assembly_set_id "
             f"from evapro.browsable_file "
             f"where project_accession = '{self.project_accession}';")
         rows_in_table = get_all_results_for_query(conn, files_query)
         find_browsable_files_query = (
             "select file.file_id,ena_submission_file_id,filename,project_accession,assembly_set_id "
             "from (select * from analysis_file af "
             "join analysis a on a.analysis_accession = af.analysis_accession "
             "join project_analysis pa on af.analysis_accession = pa.analysis_accession "
             f"where pa.project_accession = '{self.project_accession}' ) myfiles "
             "join file on file.file_id = myfiles.file_id where file.file_type ilike 'vcf';"
         )
         # The expected rows must come from the analysis/file join rather than
         # from browsable_file itself, otherwise the comparison below could
         # never detect a difference.
         rows_expected = get_all_results_for_query(
             conn, find_browsable_files_query)
         if len(rows_in_table) > 0:
             if set(rows_in_table) == set(rows_expected):
                 self.info('Browsable files already inserted, skipping')
             else:
                 self.warning(
                     f'Found {len(rows_in_table)} browsable file rows in the table but they are different '
                     f'from the expected ones: '
                     f'{os.linesep + os.linesep.join([str(row) for row in rows_expected])}'
                 )
         else:
             self.info('Inserting browsable files...')
             insert_query = (
                 "insert into browsable_file (file_id,ena_submission_file_id,filename,project_accession,"
                 "assembly_set_id) " + find_browsable_files_query)
             execute_query(conn, insert_query)
def create_table_to_collect_assembly_report_genbank_contigs(private_config_xml_file):
    """Create the table holding assembly report contigs if it does not exist.

    The table name comes from the module-level ``asm_report_contigs_table_name``
    and has assembly accession, contig accession and chromosome name columns.

    :param private_config_xml_file: settings file used to resolve the
        "development" metadata database URI
    """
    metadata_uri = get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file)
    with psycopg2.connect(metadata_uri, user="******") as pg_conn:
        create_table_query = (
            f"create table if not exists {asm_report_contigs_table_name} "
            "(assembly_accession text, contig_accession text, "
            "chromosome_name text)")
        execute_query(pg_conn, create_table_query)
예제 #9
0
def get_counts_from_variant_warehouse(assembly, project, analyses,
                                      metadata_handle, mongo_handle,
                                      database_name):
    """Store per-analysis variant counts from the variant warehouse in eva_stats.

    Aggregates the ``variants_2_0`` collection of ``database_name`` to count,
    for ``project``, how many times each analysis of ``analyses`` appears in
    the ``files`` entries, then writes each count into
    ``eva_stats.stats.variants_variant_warehouse`` for the matching
    assembly/project/analysis.

    :param assembly: assembly accession used to select the stats row
    :param project: project accession (``files.sid``)
    :param analyses: list of analysis accessions (``files.fid``)
    :param metadata_handle: connection passed to ``execute_query``
    :param mongo_handle: Mongo client indexed by database name
    :param database_name: variant warehouse database to query
    """
    logger.info("Database name to query: {0}".format(database_name))
    logger.info(
        "Getting counts from variants collection for {0}, {1}, {2}".format(
            assembly, project, analyses))
    variants_collection = mongo_handle[database_name]['variants_2_0']
    pipeline = [{
        "$match": {
            "files.sid": project,
            "files.fid": {
                "$in": analyses
            }
        }
    }, {
        "$project": {
            "_id": 0,
            "files.fid": 1
        }
    }, {
        "$unwind": "$files"
    }, {
        "$unwind": "$files.fid"
    }, {
        # After unwinding, drop entries that belong to other analyses.
        "$match": {
            "files.fid": {
                "$in": analyses
            }
        }
    }, {
        "$group": {
            "_id": "$files.fid",
            "count": {
                "$sum": 1
            }
        }
    }, {
        "$project": {
            "_id": 0,
            "files.fid": "$_id",
            "count": 1
        }
    }]
    cursor_variants = variants_collection.aggregate(pipeline=pipeline,
                                                    allowDiskUse=True)
    for stat in cursor_variants:
        count = stat['count']
        analysis = stat['files']['fid']
        logger.info("Update counts for {0}, {1}, {2}".format(
            assembly, project, analysis))
        # Each fragment ends with a space: without it the count is glued to
        # "where" (e.g. "123where"), which PostgreSQL 15+ rejects as trailing
        # junk after a numeric literal.
        update_counts_query = (
            "update eva_stats.stats "
            "set variants_variant_warehouse = {3} "
            "where assembly_accession = '{0}' "
            "and project_accession = '{1}' "
            "and analysis_accession = '{2}'"
        ).format(assembly, project, analysis, count)
        execute_query(metadata_handle, update_counts_query)
예제 #10
0
 def update_browsable_files_with_date(self):
     """Flag the project's browsable files as loaded and set their release date.

     The release date is the ENA hold date from the brokering config,
     formatted as ``YYYYMMDD``.
     """
     with self.metadata_connection_handle as conn:
         # update loaded and release date
         hold_date = self.eload_cfg.query('brokering', 'ena', 'hold_date')
         eva_release = hold_date.strftime('%Y%m%d')
         release_update = (
             "update evapro.browsable_file "
             f"set loaded = true, eva_release = '{eva_release}' "
             f"where project_accession = '{self.project_accession}';")
         execute_query(conn, release_update)
def create_asm_lookup_table(metadata_connection_handle, asm_lookup_table_name):
    """Create the assembly lookup table if it does not exist, then commit.

    The table is unique on (database_name, snpmapinfo_table_name, assembly)
    through a constraint named ``unique_constraint``.

    :param metadata_connection_handle: open connection to the metadata database
    :param asm_lookup_table_name: (schema-qualified) name of the table
    """
    creation_query = (
        "create table if not exists "
        f"{asm_lookup_table_name} "
        "(database_name text, snpmapinfo_table_name text, "
        "assembly text, assembly_accession text, "
        "constraint unique_constraint unique(database_name, snpmapinfo_table_name, "
        "assembly))")
    execute_query(metadata_connection_handle, creation_query)
    metadata_connection_handle.commit()
 def set_status_failed(self, assembly, taxid):
     """Mark the remapping of an origin assembly/taxonomy as ``Failed``.

     :param assembly: origin assembly accession of the tracker row
     :param taxid: taxonomy id of the tracker row
     """
     update_sql = (
         "UPDATE eva_progress_tracker.remapping_tracker "
         "SET remapping_status = 'Failed' "
         f"WHERE origin_assembly_accession='{assembly}' AND taxonomy='{taxid}'")
     environment = cfg['maven']['environment']
     settings_file = cfg['maven']['settings_file']
     with get_metadata_connection_handle(environment, settings_file) as pg_conn:
         execute_query(pg_conn, update_sql)
def update_rs_count_for_taxonomy_assembly(pg_conn, rs_count, taxonomy,
                                          assembly):
    """Set ``num_rs_to_release`` for one taxonomy/assembly in the release tracker.

    :param pg_conn: open connection passed to ``execute_query``
    :param rs_count: number of RS ids to record
    :param taxonomy: taxonomy id of the tracker row
    :param assembly: assembly accession of the tracker row
    """
    log_message = f'updating rs count({rs_count}) for taxonomy({taxonomy}) and assembly({assembly})'
    logger.info(log_message)
    update_sql = f"""UPDATE eva_progress_tracker.clustering_release_tracker SET num_rs_to_release={rs_count}
                        WHERE taxonomy={taxonomy} and assembly_accession='{assembly}'"""
    execute_query(pg_conn, update_sql)
예제 #14
0
def create_table_to_collect_possible_assemblies(private_config_xml_file):
    """Create the table of possible assemblies per GenBank accession if missing.

    The table name comes from the module-level
    ``possible_assemblies_table_name``; its primary key is
    (genbank_accession, assembly_accession).

    :param private_config_xml_file: settings file used to resolve the
        "development" metadata database URI
    """
    metadata_uri = get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file)
    with psycopg2.connect(metadata_uri, user="******") as pg_conn:
        create_table_query = (
            f"create table if not exists {possible_assemblies_table_name} "
            "(genbank_accession text, assembly_accession text, "
            "primary key (genbank_accession, assembly_accession))")
        execute_query(pg_conn, create_table_query)
예제 #15
0
def create_table_accession_counts(private_config_xml_file):
    """Create ``eva_stats.submitted_variants_load_counts`` if it does not exist.

    :param private_config_xml_file: settings file used to resolve the
        "development" metadata database URI
    """
    query_create_table = (
        'CREATE TABLE IF NOT EXISTS eva_stats.submitted_variants_load_counts '
        '(source TEXT, taxid INTEGER, assembly_accession TEXT, project_accession TEXT, date_loaded TIMESTAMP, '
        'number_submitted_variants BIGINT NOT NULL, '
        'primary key(taxid, assembly_accession, project_accession, date_loaded))'
    )
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        # Run the statement inside the connection context: psycopg2 commits
        # (or rolls back on error) when the block exits, so the DDL must not
        # be issued after it has already exited.
        execute_query(metadata_connection_handle, query_create_table)
예제 #16
0
def insert_file_analysis_into_evapro(file_dict):
    """Link a file to an analysis in the ``analysis_file`` table.

    :param file_dict: mapping with ``file_id`` and ``analysis_accession`` keys
    """
    file_id = file_dict['file_id']
    analysis_accession = file_dict['analysis_accession']
    # The values must follow the order of the column list
    # (ANALYSIS_ACCESSION first, then FILE_ID).
    query = (
        "insert into analysis_file (ANALYSIS_ACCESSION,FILE_ID) "
        f"values ('{analysis_accession}', {file_id})"
    )
    with get_metadata_connection_handle(cfg['maven']['environment'],
                                        cfg['maven']['settings_file']) as conn:
        logger.info(
            f"Create file {file_id} in the analysis_file table for '{analysis_accession}'"
        )
        execute_query(conn, query)
예제 #17
0
def remove_file_from_analysis(file_dict):
    """Delete the link between a file and an analysis from ``analysis_file``.

    :param file_dict: mapping with ``file_id`` and ``analysis_accession`` keys
    """
    file_id = file_dict['file_id']
    analysis_accession = file_dict['analysis_accession']
    delete_sql = (
        "delete from analysis_file "
        f"where file_id={file_id} and analysis_accession='{analysis_accession}'")
    environment = cfg['maven']['environment']
    settings_file = cfg['maven']['settings_file']
    with get_metadata_connection_handle(environment, settings_file) as conn:
        logger.info(
            f"Remove file {file_id} from the analysis_file table for '{analysis_accession}'")
        execute_query(conn, delete_sql)
예제 #18
0
def create_table_for_count_validation(private_config_xml_file):
    """Create the Mongo migration count validation table if it does not exist.

    The table name comes from the module-level
    ``mongo_migration_count_validation_table_name``.

    :param private_config_xml_file: settings file used to resolve the
        "development" metadata database URI
    """
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        query_create_table_for_count_validation = (
            "create table if not exists {0} "
            "(mongo_host text, database text, collection text, "
            "document_count bigint not null, report_time timestamp, "
            "primary key(mongo_host, database, collection, report_time))"
        ).format(mongo_migration_count_validation_table_name)

        # Run the statement inside the connection context: psycopg2 commits
        # (or rolls back on error) when the block exits, so the DDL must not
        # be issued after it has already exited.
        execute_query(metadata_connection_handle,
                      query_create_table_for_count_validation)
def write_counts_to_table(private_config_xml_file, counts):
    """Insert all count records into the species table with a single statement.

    The column list is taken from the keys of the first record, and every
    record is expected to provide a value for each of those keys.

    :param private_config_xml_file: settings file used to open the
        "development" metadata connection
    :param counts: sequence of mappings from column name to value; when it
        is empty nothing is written
    """
    if not counts:
        # Nothing to insert; counts[0] below would raise IndexError.
        return
    all_columns = counts[0].keys()
    # NOTE(review): values are interpolated with str() and no quoting, so
    # they are presumably all numeric - confirm against the caller.
    all_values = [
        f"({','.join(str(species_counts[c]) for c in all_columns)})"
        for species_counts in counts
    ]
    insert_query = f"insert into {species_table_name} " \
                   f"({','.join(all_columns)}) " \
                   f"values {','.join(all_values)}"
    with get_metadata_connection_handle('development',
                                        private_config_xml_file) as db_conn:
        execute_query(db_conn, insert_query)
예제 #20
0
def set_clustering_status(private_config_xml_file, clustering_tracking_table, assembly, tax_id, release_version, status):
    """Record the clustering status of an assembly/taxonomy for a release.

    The status ``'Started'`` also stamps ``clustering_start`` and
    ``'Completed'`` stamps ``clustering_end`` with the current local time in
    ISO format; any other status only updates ``clustering_status``.

    :param private_config_xml_file: settings file used to open the
        "development" metadata connection
    :param clustering_tracking_table: name of the tracking table to update
    :param assembly: assembly accession of the tracked row
    :param tax_id: taxonomy id of the tracked row
    :param release_version: release version of the tracked row
    :param status: new clustering status
    """
    now = datetime.datetime.now().isoformat()
    timestamp_columns = {'Started': 'clustering_start', 'Completed': 'clustering_end'}
    assignments = [f"clustering_status='{status}'"]
    timestamp_column = timestamp_columns.get(status)
    if timestamp_column is not None:
        assignments.append(f"{timestamp_column}='{now}'")
    update_status_query = (
        f"UPDATE {clustering_tracking_table} "
        f"SET {', '.join(assignments)}"
        f" WHERE assembly_accession='{assembly}' AND taxonomy='{tax_id}' "
        f"AND release_version={release_version}")
    with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn:
        execute_query(pg_conn, update_status_query)
def create_table_to_collect_mongo_genbank_contigs(private_config_xml_file):
    """Create the table of GenBank contigs found in Mongo if it does not exist.

    The table name comes from the module-level
    ``mongo_genbank_contigs_table_name``; its primary key is
    (source, assembly_accession, study, contig_accession).

    :param private_config_xml_file: settings file used to resolve the
        "development" metadata database URI
    """
    metadata_uri = get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file)
    with psycopg2.connect(metadata_uri, user="******") as pg_conn:
        create_table_query = (
            f"create table if not exists {mongo_genbank_contigs_table_name} "
            "(source text, assembly_accession text, "
            "study text, contig_accession text, chromosome_name text, "
            "num_entries_in_db bigint, is_contig_in_asm_report boolean, "
            "primary key(source, assembly_accession, study, "
            "contig_accession))")
        execute_query(pg_conn, create_table_query)
def create_table_for_progress(private_config_xml_file):
    """Create the ``remapping_progress`` table if it does not exist.

    :param private_config_xml_file: settings file used to resolve the
        "development" metadata database URI
    """
    # target_assembly_accession is left unquoted: backtick quoting is MySQL
    # syntax and is not accepted by PostgreSQL.
    query_create_table = (
        'CREATE TABLE IF NOT EXISTS remapping_progress '
        '(source TEXT, taxid INTEGER, scientific_name TEXT, assembly_accession TEXT, number_of_study INTEGER NOT NULL,'
        'number_submitted_variants BIGINT NOT NULL, release_number INTEGER, target_assembly_accession TEXT, '
        'report_time TIMESTAMP DEFAULT NOW(), progress_status TEXT, start_time TIMESTAMP, '
        'completion_time TIMESTAMP, remapping_version TEXT, nb_variant_extracted INTEGER, '
        'nb_variant_remapped INTEGER, nb_variant_ingested INTEGER, '
        'primary key(source, taxid, assembly_accession, release_number))')
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        # Run the statement inside the connection context: psycopg2 commits
        # (or rolls back on error) when the block exits, so the DDL must not
        # be issued after it has already exited.
        execute_query(metadata_connection_handle, query_create_table)
예제 #23
0
def create_stats_table(private_config_xml_file, ftp_table_name):
    """Create the FTP statistics table if it does not exist.

    :param private_config_xml_file: settings file used to open the
        "development" metadata connection
    :param ftp_table_name: name of the table to create; its primary key is
        (_index, _id)
    """
    query_create_table = (
        f'CREATE TABLE IF NOT EXISTS {ftp_table_name} '
        '(_index TEXT, _id TEXT, event_ts_txt TEXT, event_ts TIMESTAMP, host TEXT, uhost TEXT,'
        ' request_time TEXT, request_year INTEGER, request_ts TIMESTAMP,'
        ' file_name TEXT, file_size BIGINT, transfer_time INTEGER,'
        ' transfer_type CHAR, direction CHAR, special_action CHAR(4), access_mode CHAR,'
        ' country CHAR(2), region TEXT, city TEXT, domain_name TEXT, isp TEXT, usage_type TEXT,'
        ' primary key(_index, _id))')
    with get_metadata_connection_handle(
            'development',
            private_config_xml_file) as metadata_connection_handle:
        # Run the statement while the connection context is still active
        # rather than after the block has exited.
        execute_query(metadata_connection_handle, query_create_table)
예제 #24
0
 def update_files_with_ftp_path(self):
     """Set the public FTP path of every browsable file of the project.

     Each file listed in ``evapro.browsable_file`` for the project gets its
     ``evapro.file.ftp_file`` set to the project's folder under the EVA FTP
     area followed by the file name.

     :raises ValueError: if the project has no browsable file
     """
     files_query = (
         "select file_id, filename from evapro.browsable_file "
         f"where project_accession = '{self.project_accession}';")
     with self.metadata_connection_handle as conn:
         # update FTP file paths
         browsable_files = get_all_results_for_query(conn, files_query)
         if len(browsable_files) == 0:
             raise ValueError('Something went wrong with loading from ENA')
         for file_id, filename in browsable_files:
             ftp_path = f"/ftp.ebi.ac.uk/pub/databases/eva/{self.project_accession}/{filename}"
             ftp_update = (
                 "update evapro.file "
                 f"set ftp_file = '{ftp_path}' "
                 f"where file_id = '{file_id}';")
             execute_query(conn, ftp_update)
def create_table_for_progress(private_config_xml_file):
    """Create ``eva_progress_tracker.remapping_tracker`` if it does not exist.

    :param private_config_xml_file: settings file used to open the
        "development" metadata connection
    """
    query_create_table = (
        'CREATE TABLE IF NOT EXISTS eva_progress_tracker.remapping_tracker '
        '(source TEXT, taxonomy INTEGER, scientific_name TEXT, origin_assembly_accession TEXT, num_studies INTEGER NOT NULL,'
        'num_ss_ids BIGINT NOT NULL, release_version INTEGER, assembly_accession TEXT, '
        'remapping_report_time TIMESTAMP DEFAULT NOW(), remapping_status TEXT, remapping_start TIMESTAMP, '
        'remapping_end TIMESTAMP, remapping_version TEXT, num_ss_extracted INTEGER, '
        'num_ss_remapped INTEGER, num_ss_ingested INTEGER, '
        'primary key(source, taxonomy, origin_assembly_accession, release_version))'
    )
    with get_metadata_connection_handle(
            "development",
            private_config_xml_file) as metadata_connection_handle:
        # Run the statement while the connection context is still active
        # rather than after the block has exited.
        execute_query(metadata_connection_handle, query_create_table)
예제 #26
0
def insert_into_stats(metadata_handle, row):
    """Insert a row into ``eva_stats.stats`` unless one already exists.

    Existence is checked on (assembly, project, analysis).

    :param metadata_handle: connection used for the check and the insert
    :param row: 7-item sequence (assembly, project, analysis, file_id,
        filename, file_type, taxonomy_id)
    """
    assembly, project, analysis, file_id, filename, file_type, taxonomy_id = row
    exists_sql = (
        f"select * from eva_stats.stats where assembly_accession = '{assembly}' and project_accession = "
        f"'{project}' and analysis_accession = '{analysis}'")
    existing_rows = get_all_results_for_query(metadata_handle, exists_sql)
    if len(existing_rows) > 0:
        logger.info("Already exists {0}, {1}, {2} in eva_stats table".format(
            assembly, project, analysis))
        return

    insert_sql = (
        "insert into eva_stats.stats(assembly_accession, project_accession, analysis_accession, "
        "file_id, filename, file_type, taxonomy_id) "
        f"values ('{assembly}','{project}','{analysis}','{file_id}','{filename}','{file_type}', "
        f"{taxonomy_id})")
    logger.info("Insert data for {0}, {1}, {2} in eva_stats table".format(
        assembly, project, analysis))
    execute_query(metadata_handle, insert_sql)
def fill_in_from_previous_inventory(private_config_xml_file, release_version):
    """Copy 'DBSNP - filesystem' inventory entries into the release tracker.

    Reads the release species inventory rows of release 2 whose source is
    'DBSNP - filesystem' and inserts each of them into the clustering release
    tracker under ``release_version``, flagged as neither to be clustered nor
    to be released. Existing tracker rows are left alone.

    :param private_config_xml_file: settings file used to open the
        "production" metadata connection
    :param release_version: release version written to the tracker rows
    """
    inventory_sql = (
        "select taxonomy_id, scientific_name, assembly, sources, total_num_variants, release_folder_name "
        "from dbsnp_ensembl_species.release_species_inventory where sources='DBSNP - filesystem' and release_version=2")
    clustered_flag = False
    released_flag = False
    with get_metadata_connection_handle("production", private_config_xml_file) as pg_conn:
        inventory_rows = get_all_results_for_query(pg_conn, inventory_sql)
        # total_num_variants is selected but not copied to the tracker.
        for taxonomy, scientific_name, assembly, sources, _total_num_variants, folder_name in inventory_rows:
            insert_sql = (
                'INSERT INTO eva_progress_tracker.clustering_release_tracker '
                '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
                'should_be_released, release_folder_name) '
                f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly}', {release_version}, "
                f"{clustered_flag}, {released_flag}, '{folder_name}') ON CONFLICT DO NOTHING")
            execute_query(pg_conn, insert_sql)
def insert_new_entry_for_taxonomy_assembly(pg_conn, sources, rs_count, release_version, taxonomy, assembly, reference_directory):
    """Add a release tracker row for a taxonomy/assembly not tracked yet.

    The row is flagged as not to be clustered but to be released, and is
    skipped if it conflicts with an existing one. ``rs_count`` is only
    logged; it is not part of the inserted row.

    :param pg_conn: open connection used for lookups and the insert
    :param sources: value stored in the ``sources`` column
    :param rs_count: RS count reported in the log message
    :param release_version: release version of the new row
    :param taxonomy: taxonomy id of the new row
    :param assembly: assembly accession of the new row
    :param reference_directory: directory handed to ``NCBIAssembly`` to
        derive the fasta and assembly report paths
    """
    logger.info(f'inserting rs count({rs_count}) for taxonomy({taxonomy}) and assembly({assembly})')
    scientific_name = get_scientific_name(pg_conn, taxonomy)
    folder_name = normalise_taxon_scientific_name(scientific_name)
    assembly_files = NCBIAssembly(assembly, scientific_name, reference_directory)
    fasta_path = assembly_files.assembly_fasta_path
    report_path = assembly_files.assembly_report_path
    tempmongo_instance = get_tempmongo_instance(pg_conn, taxonomy)
    clustered_flag, released_flag = False, True
    column_list = (
        '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
        'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) ')
    value_list = (
        f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly}', {release_version}, "
        f"{clustered_flag}, '{fasta_path}', '{report_path}', '{tempmongo_instance}', {released_flag}, "
        f"'{folder_name}') ON CONFLICT DO NOTHING")
    insert_sql = ('INSERT INTO eva_progress_tracker.clustering_release_tracker '
                  + column_list + value_list)
    execute_query(pg_conn, insert_sql)
예제 #29
0
def store_accessioning_counts(mongo_handle, metadata_handle, collection_name, provided_assemblies):
    """Upsert per-assembly document counts of an accessioning collection.

    Counts the documents of ``eva_accession_sharded.<collection_name>`` for
    each assembly and upserts the result into ``eva_stats.rs_stats`` (for
    ``clusteredVariantEntity``) or ``eva_stats.ss_stats`` (any other
    collection).

    :param mongo_handle: Mongo client indexed by database name
    :param metadata_handle: connection passed to ``execute_query``
    :param collection_name: name of the collection to count
    :param provided_assemblies: assemblies to process; when empty or None,
        every distinct assembly of the collection is processed
    """
    collection = mongo_handle["eva_accession_sharded"][collection_name]
    is_clustered = collection_name == "clusteredVariantEntity"
    assembly_field = "asm" if is_clustered else "seq"
    stats_table = "rs_stats" if is_clustered else "ss_stats"
    assemblies = provided_assemblies or collection.distinct(assembly_field)
    for assembly in assemblies:
        count_pipeline = [
            {"$match": {assembly_field: assembly}},
            {"$group": {"_id": assembly, "count": {"$sum": 1}}},
        ]
        for stat in collection.aggregate(pipeline=count_pipeline, allowDiskUse=True):
            logger.info(stat)
            counted_assembly = stat["_id"]
            num_variants = stat["count"]
            upsert_sql = (
                f"insert into eva_stats.{stats_table} values ('{counted_assembly}', {num_variants}) "
                f"on conflict(assembly) do update set num_variants = {num_variants}")
            execute_query(metadata_handle, upsert_sql)
예제 #30
0
    def update_loaded_assembly_in_browsable_files(self):
        """Copy each browsable file's reference assembly into ``loaded_assembly``.

        For every browsable file of the project, the ``vcf_reference_accession``
        of its analysis is written to ``evapro.browsable_file.loaded_assembly``.

        :raises ValueError: if no browsable file is found for the project
        """
        # find assembly associated with each browseable file and copy it to the browsable file table
        assembly_per_file_sql = (
            "select bf.file_id, a.vcf_reference_accession "
            "from analysis a "
            "join analysis_file af on a.analysis_accession=af.analysis_accession "
            "join browsable_file bf on af.file_id=bf.file_id "
            f"where bf.project_accession='{self.project_accession}';")
        with self.metadata_connection_handle as conn:
            file_assemblies = get_all_results_for_query(conn, assembly_per_file_sql)
            if len(file_assemblies) == 0:
                raise ValueError('Something went wrong with loading from ENA')

            # Update each file with its associated assembly accession
            for file_id, assembly_accession in file_assemblies:
                assembly_update = (
                    "update evapro.browsable_file "
                    f"set loaded_assembly = '{assembly_accession}' "
                    f"where file_id = '{file_id}';")
                execute_query(conn, assembly_update)