def collect_mongo_genbank_contigs(private_config_xml_file, assembly_accession):
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") \
                as metadata_connection_handle, MongoClient(get_mongo_uri_for_eva_profile("development",
                                                                                         private_config_xml_file)) \
                as mongo_connection_handle:
            main_collections = [
                "dbsnpSubmittedVariantEntity", "submittedVariantEntity"
            ]
            for collection in main_collections:
                insert_contig_info_to_db(collection, assembly_accession,
                                         metadata_connection_handle,
                                         mongo_connection_handle)
            ops_collections = [
                "dbsnpSubmittedVariantOperationEntity",
                "submittedVariantOperationEntity"
            ]
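            # In the operation collections the variant documents (and hence the assembly attribute)
            # are nested under "inactiveObjects", so that prefix is passed below.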
            for collection in ops_collections:
                insert_contig_info_to_db(
                    collection,
                    assembly_accession,
                    metadata_connection_handle,
                    mongo_connection_handle,
                    assembly_attribute_prefix="inactiveObjects.")
    except Exception:
        logger.error(traceback.format_exc())
Example #2
def export_all_multimap_snps_from_dbsnp_dumps(private_config_xml_file):
    result_file = "all_multimap_snp_ids_from_dbsnp_dumps.txt"
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("production", private_config_xml_file), user="******") \
        as metadata_connection_handle:
        assembly_GCA_accession_map = get_assemblies_with_multimap_snps_for_species(
            metadata_connection_handle)
        for species_info in get_species_info(metadata_connection_handle):
            species_name = species_info["database_name"]
            logger.info("Processing species {0}...".format(species_name))
            if species_name in assembly_GCA_accession_map:
                with get_db_conn_for_species(
                        species_info) as species_connection_handle:
                    export_query = "select snp_id, assembly from dbsnp_{0}.multimap_snps " \
                                   "where assembly in ({1})"\
                        .format(species_name, ",".join(["'{0}'".format(assembly) for assembly in
                                                       assembly_GCA_accession_map[species_name].keys()]))
                    logger.info("Running export query: " + export_query)
                    with open(result_file, 'a') as result_file_handle:
                        for snp_id, assembly in get_result_cursor(
                                species_connection_handle, export_query):
                            result_file_handle.write("{0},{1}\n".format(
                                snp_id,
                                assembly_GCA_accession_map[species_name]
                                [assembly]))

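    # sort -u deduplicates the SNP IDs collected across all species and rewrites the file in place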
    run_command_with_output(
        "Sorting multimap SNP IDs from dbSNP source dumps...",
        "sort -u {0} -o {0}".format(result_file))
def copy_accessioning_collections_to_embassy(private_config_xml_file, profile, taxonomy_id, assembly_accession,
                                             collections_to_copy, release_species_inventory_table, release_version,
                                             dump_dir):
    port_forwarding_process_id, mongo_port, exit_code = None, None, None
    try:
        port_forwarding_process_id, mongo_port = open_mongo_port_to_tempmongo(private_config_xml_file, profile, taxonomy_id,
                                                                              assembly_accession, release_species_inventory_table,
                                                                              release_version)
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                              user="******") as \
                metadata_connection_handle:
            # To be idempotent, clear destination tempmongo database
            destination_db_name = get_release_db_name_in_tempmongo_instance(taxonomy_id)
            MongoClient(port=mongo_port).drop_database(destination_db_name)

            release_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
                                                                   release_species_inventory_table,
                                                                   release_version, metadata_connection_handle)
            logger.info("Beginning data copy to remote MongoDB host {0} on port {1}..."
                        .format(release_info["tempmongo_instance"], mongo_port))
            collections_to_copy_map = get_collections_to_copy(collections_to_copy, sources=release_info["sources"])
            mongo_data_copy_to_remote_host(mongo_port, private_config_xml_file, profile, assembly_accession,
                                           collections_to_copy_map, dump_dir, destination_db_name)
            exit_code = 0
    except Exception:
        logger.error("Encountered an error while copying species data to Embassy for assembly "
                     + assembly_accession + "\n" + traceback.format_exc())
        exit_code = -1
    finally:
        close_mongo_port_to_tempmongo(port_forwarding_process_id)
        logger.info("Copy process completed with exit_code: " + str(exit_code))
        sys.exit(exit_code)
Example #4
def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path,
                                  bcftools_path, vcf_sort_script_path,
                                  taxonomy_id, assembly_accession,
                                  release_species_inventory_table,
                                  release_version, species_release_folder):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file), user="******") \
        as metadata_connection_handle:
        release_info = get_release_inventory_info_for_assembly(
            taxonomy_id, assembly_accession, release_species_inventory_table,
            release_version, metadata_connection_handle)
        merge_commands = []
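        # Collect one merge command per VCF/text file category, then run them all as a single '&&'-chained shell command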
        for vcf_file_category in release_vcf_file_categories:
            merge_commands.extend(
                merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path,
                                          vcf_sort_script_path,
                                          assembly_accession,
                                          species_release_folder,
                                          vcf_file_category,
                                          release_info["sources"]))
        for text_release_file_category in release_text_file_categories:
            merge_commands.extend(
                merge_dbsnp_eva_text_files(assembly_accession,
                                           species_release_folder,
                                           text_release_file_category,
                                           release_info["sources"]))
        final_merge_command = " && ".join(merge_commands)
        run_command_with_output(
            "Merging dbSNP and EVA release files for assembly: " +
            assembly_accession, final_merge_command)
Example #5
def main():
    parser = argparse.ArgumentParser(
        description=
        'Get possible assemblies where given Genbank contigs are present',
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False)
    parser.add_argument(
        "--private-config-xml-file",
        help=
        "Full path to private configuration file (ex: /path/to/settings.xml)",
        required=True)
    parser.add_argument("--eutils-api-key",
                        help="EUtils API key",
                        required=True)
    args = parser.parse_args()

    Entrez.api_key = args.eutils_api_key
    create_table_to_collect_possible_assemblies(args.private_config_xml_file)
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", args.private_config_xml_file),
                          user="******") \
            as metadata_connection_handle:
        for contig_accession in sys.stdin:
            contig_accession = contig_accession.strip()
            logger.info(
                f"Getting possible assemblies for {contig_accession} from EUtils..."
            )
            possible_assemblies = get_assemblies_where_contig_appears(
                contig_accession)
            insert_possible_assemblies_for_contig(metadata_connection_handle,
                                                  contig_accession,
                                                  possible_assemblies)
def run_release_for_species(common_release_properties_file, taxonomy_id,
                            memory):
    common_release_properties = get_common_release_properties(
        common_release_properties_file)
    private_config_xml_file = common_release_properties[
        "private-config-xml-file"]
    release_species_inventory_table = common_release_properties[
        "release-species-inventory-table"]
    release_version = common_release_properties["release-version"]
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file), user="******") \
        as metadata_connection_handle:
        release_assemblies = get_release_assemblies_for_taxonomy(
            taxonomy_id, release_species_inventory_table, release_version,
            metadata_connection_handle)
        workflow_file_name, release_log_file = prepare_release_workflow_file_for_species(
            common_release_properties, taxonomy_id, release_assemblies, memory)
        workflow_report_file_name = workflow_file_name.replace(
            ".nf", ".report.html")
        if os.path.exists(workflow_report_file_name):
            os.remove(workflow_report_file_name)
        workflow_command = "cd {0} && {1} run {2} -c {3} -with-report {4} -bg".format(
            os.path.dirname(release_log_file),
            common_release_properties["nextflow-binary-path"],
            workflow_file_name,
            common_release_properties["nextflow-config-path"],
            workflow_report_file_name)
        logger.info("Check log file in: " + release_log_file +
                    " to monitor progress...")
        logger.info(
            "Running workflow file {0} with the following command:\n\n {1} \n\n"
            "Use the above command with -resume if this workflow needs to be resumed in the future"
            .format(workflow_file_name, workflow_command))
        os.system(workflow_command)
Example #7
def get_assemblies_from_evapro(private_config_xml_file):
    metadata_handle = psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
        "development", private_config_xml_file),
                                       user="******")
    query = "select assembly_accession from accessioned_assembly where assembly_accession like 'GCA%'"
    evapro_assemblies = get_all_results_for_query(metadata_handle, query)
    return [asm[0] for asm in evapro_assemblies]
def create_table_to_collect_assembly_report_genbank_contigs(private_config_xml_file):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file), user="******") \
            as metadata_connection_handle:
        create_table_to_store_asm_report_contigs_query = "create table if not exists {0} " \
                                                         "(assembly_accession text, contig_accession text, " \
                                                         "chromosome_name text)".format(asm_report_contigs_table_name)

        execute_query(metadata_connection_handle, create_table_to_store_asm_report_contigs_query)
Example #9
def create_table_to_collect_possible_assemblies(private_config_xml_file):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file), user="******") \
            as metadata_connection_handle:
        create_table_to_store_possible_assemblies_query = "create table if not exists {0} " \
                                                         "(genbank_accession text, assembly_accession text, " \
                                                          "primary key (genbank_accession, assembly_accession))"\
            .format(possible_assemblies_table_name)

        execute_query(metadata_connection_handle,
                      create_table_to_store_possible_assemblies_query)
Example #10
def create_table_accession_counts(private_config_xml_file):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        query_create_table = (
            'CREATE TABLE IF NOT EXISTS eva_stats.submitted_variants_load_counts '
            '(source TEXT, taxid INTEGER, assembly_accession TEXT, project_accession TEXT, date_loaded TIMESTAMP, '
            'number_submitted_variants BIGINT NOT NULL, '
            'primary key(taxid, assembly_accession, project_accession, date_loaded))'
        )
        execute_query(metadata_connection_handle, query_create_table)
Example #11
def create_table_for_count_validation(private_config_xml_file):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        query_create_table_for_count_validation = "create table if not exists {0} " \
                                                  "(mongo_host text, database text, collection text, " \
                                                  "document_count bigint not null, report_time timestamp, " \
                                                  "primary key(mongo_host, database, collection, report_time))" \
            .format(mongo_migration_count_validation_table_name)

        execute_query(metadata_connection_handle,
                      query_create_table_for_count_validation)
Example #12
def insert_count_validation_result_to_db(private_config_xml_file,
                                         count_validation_res_list):
    if len(count_validation_res_list) > 0:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
                "development", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            with metadata_connection_handle.cursor() as cursor:
                psycopg2.extras.execute_values(
                    cursor, "INSERT INTO {0} "
                    "(mongo_host, database, collection, document_count,report_time) "
                    "VALUES %s".format(
                        mongo_migration_count_validation_table_name),
                    [count_validation_res_list])
def create_table_for_progress(private_config_xml_file):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        query_create_table = (
            'CREATE TABLE IF NOT EXISTS remapping_progress '
            '(source TEXT, taxid INTEGER, scientific_name TEXT, assembly_accession TEXT, number_of_study INTEGER NOT NULL,'
            'number_submitted_variants BIGINT NOT NULL, release_number INTEGER, target_assembly_accession TEXT, '
            'report_time TIMESTAMP DEFAULT NOW(), progress_status TEXT, start_time TIMESTAMP, '
            'completion_time TIMESTAMP, remapping_version TEXT, nb_variant_extracted INTEGER, '
            'nb_variant_remapped INTEGER, nb_variant_ingested INTEGER, '
            'primary key(source, taxid, assembly_accession, release_number))')
        execute_query(metadata_connection_handle, query_create_table)
def create_table_to_collect_mongo_genbank_contigs(private_config_xml_file):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file), user="******") \
            as metadata_connection_handle:
        create_table_to_store_asm_report_contigs_query = "create table if not exists {0} " \
                                                         "(source text, assembly_accession text, " \
                                                         "study text, contig_accession text, chromosome_name text, " \
                                                         "num_entries_in_db bigint, is_contig_in_asm_report boolean, " \
                                                         "primary key(source, assembly_accession, study, " \
                                                         "contig_accession))"\
            .format(mongo_genbank_contigs_table_name)

        execute_query(metadata_connection_handle,
                      create_table_to_store_asm_report_contigs_query)
Example #15
def update_release_status_for_assembly(private_config_xml_file, taxonomy_id,
                                       assembly_accession, release_version):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        update_release_progress_status(metadata_connection_handle,
                                       taxonomy_id,
                                       assembly_accession,
                                       release_version,
                                       release_status='done')
        logger.info(
            "Successfully marked release status as 'Done' in {0} for taxonomy {1} and assembly {2}"
            .format(release_progress_table, taxonomy_id, assembly_accession))
def add_mapping_weight_attribute_for_dbsnp_species(private_config_xml_file, dbsnp_species_name):
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("production", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            dbsnp_species_taxonomy = int(dbsnp_species_name.split("_")[-1])
            import_mapping_weight_attribute_for_dbsnp_species(private_config_xml_file, metadata_connection_handle,
                                                              dbsnp_species_taxonomy)
    except Exception as ex:
        logger.error("Encountered an error while adding mapping attribute for " + dbsnp_species_name + "\n" +
                     "\n".join(traceback.format_exception(type(ex), ex, ex.__traceback__)))
        sys.exit(1)

    sys.exit(0)
def insert_remapping_progress_to_db(private_config_xml_file, dataframe):
    list_to_remap = dataframe.values.tolist()
    if len(list_to_remap) > 0:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
                "development", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            with metadata_connection_handle.cursor() as cursor:
                query_insert = (
                    'INSERT INTO remapping_progress '
                    '(source, taxid, scientific_name, assembly_accession, number_of_study, '
                    'number_submitted_variants, target_assembly_accession, release_number) '
                    'VALUES %s')
                psycopg2.extras.execute_values(cursor, query_insert,
                                               list_to_remap)
def publish_release_files_to_ftp(common_release_properties_file, taxonomy_id):
    release_properties = ReleaseProperties(common_release_properties_file)
    create_requisite_folders(release_properties)
    # Release README, known issues etc.,
    publish_release_top_level_files_to_ftp(release_properties)

    metadata_password = get_properties_from_xml_file(
        "development",
        release_properties.private_config_xml_file)["eva.evapro.password"]
    with psycopg2.connect(
            get_pg_metadata_uri_for_eva_profile(
                "development", release_properties.private_config_xml_file),
            user="******",
            password=metadata_password) as metadata_connection_handle:
        assemblies_to_process = get_release_assemblies_info_for_taxonomy(
            taxonomy_id, release_properties, metadata_connection_handle)
        species_has_unmapped_data = "Unmapped" in set([
            assembly_info["assembly"]
            for assembly_info in assemblies_to_process
        ])

        # Publish species level data
        species_current_release_folder_name, species_previous_release_folder_name = \
            get_current_and_previous_release_folders_for_taxonomy(taxonomy_id, release_properties,
                                                                  metadata_connection_handle)

        create_species_folder(release_properties,
                              species_current_release_folder_name)

        # Unmapped variant data is published at the species level
        # because it is not associated with any specific assembly
        if species_has_unmapped_data:
            publish_species_level_files_to_ftp(
                release_properties, species_current_release_folder_name,
                species_previous_release_folder_name,
                species_has_unmapped_data)

        # Publish assembly level data
        for current_release_assembly_info in \
                get_release_assemblies_for_release_version(assemblies_to_process, release_properties.release_version):
            if current_release_assembly_info["assembly"] != "Unmapped":
                publish_assembly_release_files_to_ftp(
                    current_release_assembly_info, release_properties)

        # Symlinks with assembly names in the species folder ex: Sorbi1 -> GCA_000003195.1
        create_assembly_name_symlinks(
            get_folder_path_for_species(
                release_properties.public_ftp_current_release_folder,
                species_current_release_folder_name))
def get_accession_counts_per_assembly(private_config_xml_file, source):
    accession_count = {}
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", private_config_xml_file),
                          user="******") as pg_conn:
        query = (
            'SELECT assembly_accession, taxid, SUM(number_submitted_variants) '
            'FROM eva_stats.submitted_variants_load_counts '
            "WHERE source='%s'"
            'GROUP BY assembly_accession, taxid ' % source)
        for assembly_accession, taxid, count_ssid in get_all_results_for_query(
                pg_conn, query):
            accession_count[assembly_accession] = (assembly_accession, taxid,
                                                   count_ssid)
    return accession_count
def update_release_status_for_assembly(private_config_xml_file, profile,
                                       release_species_inventory_table,
                                       taxonomy_id, assembly_accession,
                                       release_version):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            profile, private_config_xml_file),
                          user="******") as metadata_connection_handle:
        update_release_progress_status(metadata_connection_handle,
                                       release_species_inventory_table,
                                       taxonomy_id,
                                       assembly_accession,
                                       release_version,
                                       release_status='Completed')
        logger.info(
            "Successfully marked release status as 'Completed' in {0} for taxonomy {1} and assembly {2}"
            .format(release_species_inventory_table, taxonomy_id,
                    assembly_accession))
def validate_release_vcf_files(private_config_xml_file, profile, taxonomy_id,
                               assembly_accession,
                               release_species_inventory_table,
                               release_version, species_release_folder,
                               vcf_validator_path, assembly_checker_path):
    run_command_with_output(
        "Remove existing VCF validation and assembly report outputs...",
        "rm -f {0}/{1}/{2} {0}/{1}/{3}".format(
            species_release_folder, assembly_accession,
            vcf_validation_output_file_pattern,
            asm_report_output_file_pattern))
    validate_release_vcf_files_commands = []
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                          user="******") as \
            metadata_connection_handle:
        release_inventory_info_for_assembly = get_release_inventory_info_for_assembly(
            taxonomy_id, assembly_accession, release_species_inventory_table,
            release_version, metadata_connection_handle)
        fasta_path = release_inventory_info_for_assembly["fasta_path"]
        assembly_report_path = release_inventory_info_for_assembly[
            "report_path"]
        remove_index_if_outdated(fasta_path)
        if assembly_report_path.startswith("file:/"):
            assembly_report_path = assembly_report_path.replace("file:/", "/")

        for vcf_file_category in release_vcf_file_categories:

            release_vcf_file_name = get_release_vcf_file_name(
                species_release_folder, assembly_accession, vcf_file_category)
            release_vcf_dir = os.path.dirname(release_vcf_file_name)
            if "multimap" not in vcf_file_category:
                validate_release_vcf_files_commands.append(
                    "({0} -i {1} -o {2}) || true".format(
                        vcf_validator_path, release_vcf_file_name,
                        release_vcf_dir))
                validate_release_vcf_files_commands.append(
                    "({0} -i {1} -f {2} -a {3} -o {4} -r text,summary) || true"
                    .format(assembly_checker_path, release_vcf_file_name,
                            fasta_path, assembly_report_path, release_vcf_dir))

        # We don't expect the validation commands to all pass, hence use semi-colon to run them back to back
        final_validate_command = " ; ".join(
            validate_release_vcf_files_commands)
        run_command_with_output(
            "Validating release files for assembly: " + assembly_accession,
            final_validate_command)
Example #22
def insert_accession_counts_to_db(private_config_xml_file, accession_counts,
                                  source):
    if len(accession_counts) > 0:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
                "development", private_config_xml_file),
                              user="******") as metadata_connection_handle:
            with metadata_connection_handle.cursor() as cursor:
                query_insert = (
                    'INSERT INTO eva_stats.submitted_variants_load_counts '
                    '(source, assembly_accession, taxid, project_accession, date_loaded, number_submitted_variants) '
                    'VALUES %s '
                    'ON CONFLICT (taxid, assembly_accession, project_accession, date_loaded) '
                    'DO UPDATE SET number_submitted_variants = EXCLUDED.number_submitted_variants'
                )
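                # The template inlines the source value as a literal and leaves five placeholders for
                # (assembly_accession, taxid, project_accession, date_loaded, number_submitted_variants)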
                psycopg2.extras.execute_values(
                    cursor, query_insert, accession_counts,
                    ("('" + source + "', %s, %s, %s, %s, %s)"))
Example #23
def main(private_config_xml_file):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file), user="******") as pg_conn:
        query = (
            "select distinct origin_assembly_accession, assembly_accession "
            "from eva_progress_tracker.remapping_tracker "
            "where origin_assembly_accession!=assembly_accession and num_ss_ids>0"
        )
        for source_assembly, target_assembly in get_all_results_for_query(pg_conn, query):
            source_assembly_info = get_ncbi_assembly_dicts_from_term(source_assembly)[0]
            target_assembly_info = get_ncbi_assembly_dicts_from_term(target_assembly)[0]
            source_taxid = source_assembly_info['speciestaxid']
            target_taxid = target_assembly_info['speciestaxid']
            source_organism = source_assembly_info['organism']
            target_organism = target_assembly_info['organism']
            if source_taxid != target_taxid:
                print(f'{source_assembly} and {target_assembly} belong to different species: '
                      f'{source_organism} != {target_organism}')
def get_release_properties_for_assembly(private_config_xml_file, profile, taxonomy_id, assembly_accession,
                                        release_species_inventory_table, release_version, species_release_folder):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                          user="******") as \
            metadata_connection_handle:
        release_inventory_info_for_assembly = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
                                                                                      release_species_inventory_table,
                                                                                      release_version,
                                                                                      metadata_connection_handle)
    if not release_inventory_info_for_assembly["report_path"].startswith("file:"):
        release_inventory_info_for_assembly["report_path"] = "file:" + \
                                                             release_inventory_info_for_assembly["report_path"]
    release_inventory_info_for_assembly["output_folder"] = os.path.join(species_release_folder, assembly_accession)
    release_inventory_info_for_assembly["mongo_accessioning_db"] = \
        get_release_db_name_in_tempmongo_instance(taxonomy_id)
    return merge_two_dicts(release_inventory_info_for_assembly,
                           get_release_job_repo_properties(private_config_xml_file, profile))
Example #25
def open_mongo_port_to_tempmongo(private_config_xml_file, profile, taxonomy_id,
                                 assembly, release_species_inventory_table,
                                 release_version):
    MONGO_PORT = 27017
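    # Find a free local port to forward to the remote tempmongo instance's MongoDB port (27017)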
    local_forwarded_port = get_available_local_port(MONGO_PORT)
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                              user="******") as \
                metadata_connection_handle:
            tempmongo_instance = get_target_mongo_instance_for_assembly(
                taxonomy_id, assembly, release_species_inventory_table,
                release_version, metadata_connection_handle)
            logger.info(
                "Forwarding remote MongoDB port 27017 to local port {0}...".
                format(local_forwarded_port))
            port_forwarding_process_id = forward_remote_port_to_local_port(
                tempmongo_instance, MONGO_PORT, local_forwarded_port)
            return port_forwarding_process_id, local_forwarded_port
    except Exception:
        raise Exception(
            "Encountered an error while opening a port to the remote MongoDB instance for assembly "
            + assembly + "\n" + traceback.format_exc())
def collect_assembly_report_genbank_contigs(private_config_xml_file, assembly_accession):
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") \
                as metadata_connection_handle:
            assembly_report_url = assembly_utils.get_assembly_report_url(assembly_accession)
            assembly_report_file_name = os.path.basename(assembly_report_url)
            os.system("rm -f " + assembly_report_file_name)
            wget.download(assembly_report_url)

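            # Parse the assembly report and insert contig rows in batches of insert_chunk_size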
            insert_chunk_size = 100
            contig_info_list = []
            for line in open(assembly_report_file_name, 'r'):
                if not line.strip().startswith("#"):
                    line_components = line.strip().split("\t")
                    chromosome_name, genbank_accession = line_components[0], line_components[4]
                    contig_info_list.append((assembly_accession, genbank_accession, chromosome_name))
                    if len(contig_info_list) == insert_chunk_size:
                        insert_contigs_to_db(metadata_connection_handle, contig_info_list)
                        contig_info_list = []
            insert_contigs_to_db(metadata_connection_handle, contig_info_list)
    except Exception:
        logger.error(traceback.format_exc())
def main():
    argparser = ArgumentParser()
    argparser.add_argument("--private-config-xml-file",
                           help="ex: /path/to/eva-maven-settings.xml",
                           required=True)
    argparser.add_argument("--assembly_accession",
                           help="GCA_000003205.1",
                           required=True)
    argparser.add_argument("--assembly_report_path",
                           help="path to the report to check contigs against",
                           required=True)
    args = argparser.parse_args()

    genbank_to_row = get_contig_genbank(args.assembly_report_path)

    log_cfg.add_stdout_handler()

    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
            "development", args.private_config_xml_file),
                          user="******") as pg_conn:
        eva_contigs, dbSNP_contigs = get_contigs_accessions_for(
            pg_conn, args.assembly_accession)

        for contig in eva_contigs:
            if contig not in genbank_to_row:
                logger.warning(
                    'For assembly {}, contig {} found in EVA is not listed as a GenBank accession in the report {}'
                    .format(args.assembly_accession, contig,
                            args.assembly_report_path))
        for contig in dbSNP_contigs:
            if contig not in genbank_to_row:
                logger.warning(
                    'For assembly {}, contig {} found in dbSNP is not listed as a GenBank accession in the report {}'
                    .format(args.assembly_accession, contig,
                            args.assembly_report_path))

    return 0
Example #28
def collect_assembly_report_genbank_contigs(private_config_xml_file,
                                            assembly_accession):
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") \
                as metadata_connection_handle:
            asm = NCBIAssembly(assembly_accession,
                               species_scientific_name=None,
                               reference_directory=None)
            assembly_report_file_name = os.path.basename(
                asm.assembly_report_url)
            os.system("rm -f " + assembly_report_file_name)
            wget.download(asm.assembly_report_url)

            insert_chunk_size = 100
            contig_info_list = []
            for line in open(assembly_report_file_name, 'r'):
                if not line.strip().startswith("#"):
                    line_components = line.strip().split("\t")
                    chromosome_name, genbank_accession, accession_equivalence, refseq_accession = \
                        line_components[0], line_components[4], line_components[5], line_components[6]
                    # Equivalence "Relationship" column in the assembly report indicates if
                    # Genbank and RefSeq contig accessions are equivalent
                    is_equivalent_genbank_available = (
                        accession_equivalence.strip() == "=")
                    contig_info_list.append(
                        (assembly_accession, genbank_accession,
                         chromosome_name, is_equivalent_genbank_available,
                         refseq_accession))
                    if len(contig_info_list) == insert_chunk_size:
                        insert_contigs_to_db(metadata_connection_handle,
                                             contig_info_list)
                        contig_info_list = []
            insert_contigs_to_db(metadata_connection_handle, contig_info_list)
    except Exception:
        logger.error(traceback.format_exc())
Example #29
def test_get_pg_metadata_uri_for_eva_profile(self):
    self.assertEqual(
        get_pg_metadata_uri_for_eva_profile('test', self.config_file),
        'postgresql://pgsql.example.com:5432/testdatabase')
Example #30
def insert_counts_in_db(private_config_xml_file, metrics_per_assembly,
                        ranges_per_assembly):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file), user="******") \
            as metadata_connection_handle:
        for asm in metrics_per_assembly:
            # get last release data for assembly
            query_release2 = f"select * from dbsnp_ensembl_species.release_rs_statistics_per_assembly "\
                             f"where assembly_accession = '{asm}' and release_version = 2"
            logger.info(query_release2)
            asm_last_release_data = get_all_results_for_query(
                metadata_connection_handle, query_release2)

            # insert data for release 3
            taxid = ranges_per_assembly[asm]['taxid']
            scientific_name = ranges_per_assembly[asm][
                'scientific_name'].capitalize().replace('_', ' ')
            folder = f"{ranges_per_assembly[asm]['scientific_name']}/{asm}"
            release_version = 3

            release3_new_remapped_current_rs = metrics_per_assembly[asm][
                'new_remapped_current_rs']

            release3_new_clustered_current_rs = metrics_per_assembly[asm][
                'new_clustered_current_rs']
            release3_new_current_rs = release3_new_clustered_current_rs + release3_new_remapped_current_rs

            release3_new_merged_rs = metrics_per_assembly[asm]['merged_rs']
            release3_new_split_rs = metrics_per_assembly[asm]['split_rs']
            release3_new_ss_clustered = metrics_per_assembly[asm][
                'new_ss_clustered']

            insert_query = f"insert into dbsnp_ensembl_species.release_rs_statistics_per_assembly "\
                           f"(taxonomy_id, scientific_name, assembly_accession, release_folder, release_version, " \
                           f"current_rs, multi_mapped_rs, merged_rs, deprecated_rs, merged_deprecated_rs, " \
                           f"new_current_rs, new_multi_mapped_rs, new_merged_rs, new_deprecated_rs, " \
                           f"new_merged_deprecated_rs, new_ss_clustered, remapped_current_rs, " \
                           f"new_remapped_current_rs, split_rs, new_split_rs, ss_clustered, clustered_current_rs," \
                           f"new_clustered_current_rs) " \
                           f"values ({taxid}, '{scientific_name}', '{asm}', '{folder}', {release_version}, " \

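            # insert_query so far holds only the column list and the identifying values;
            # the count values are appended below once we know whether release 2 data exists for this assembly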
            if asm_last_release_data:
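                # The code assumes select * returns columns in the order used in the insert above:
                # index 5 = current_rs, 6 = multi_mapped_rs, 7 = merged_rs, 8 = deprecated_rs, 9 = merged_deprecated_rs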
                release2_current_rs = asm_last_release_data[0][5]
                release2_merged_rs = asm_last_release_data[0][7]
                release2_multi_mapped_rs = asm_last_release_data[0][6]
                release2_deprecated_rs = asm_last_release_data[0][8]
                release2_merged_deprecated_rs = asm_last_release_data[0][9]

                # get ss clustered
                query_ss_clustered = f"select sum(new_ss_clustered) " \
                                     f"from dbsnp_ensembl_species.release_rs_statistics_per_assembly " \
                                     f"where assembly_accession = '{asm}'"
                logger.info(query_ss_clustered)
                ss_clustered_previous_releases = get_all_results_for_query(
                    metadata_connection_handle, query_ss_clustered)
                release3_ss_clustered = ss_clustered_previous_releases[0][
                    0] + release3_new_ss_clustered

                # if assembly already existed -> add counts
                release3_current_rs = release2_current_rs + release3_new_current_rs
                release3_merged_rs = release2_merged_rs + release3_new_merged_rs
                # current_rs in previous releases (1 and 2) were all new clustered
                release3_clustered_current_rs = release2_current_rs + release3_new_clustered_current_rs

                insert_query_values = f"{release3_current_rs}, " \
                                      f"{release2_multi_mapped_rs}, " \
                                      f"{release3_merged_rs}, " \
                                      f"{release2_deprecated_rs}, " \
                                      f"{release2_merged_deprecated_rs}, " \
                                      f"{release3_new_current_rs}, " \
                                      f"0, " \
                                      f"{release3_new_merged_rs}, " \
                                      f"0, " \
                                      f"0, " \
                                      f"{release3_new_ss_clustered}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_ss_clustered}," \
                                      f"{release3_clustered_current_rs}," \
                                      f"{release3_new_clustered_current_rs})"
            else:
                # if new assembly
                insert_query_values = f"{release3_new_current_rs}, " \
                                      f"0, " \
                                      f"{release3_new_merged_rs}, " \
                                      f"0, " \
                                      f"0, " \
                                      f"{release3_new_current_rs}, " \
                                      f"0, " \
                                      f"{release3_new_merged_rs}, " \
                                      f"0, " \
                                      f"0, " \
                                      f"{release3_new_ss_clustered}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_remapped_current_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_new_split_rs}, " \
                                      f"{release3_new_ss_clustered}," \
                                      f"{release3_new_clustered_current_rs}," \
                                      f"{release3_new_clustered_current_rs})"
            insert_query = f"{insert_query} {insert_query_values}"
            logger.info(insert_query)
            execute_query(metadata_connection_handle, insert_query)

        # get assemblies in from release 1 and 2 not in release 3
        assemblies_in_logs = ",".join(f"'{a}'"
                                      for a in ranges_per_assembly.keys())
        query_missing_assemblies_stats = f"select * " \
                                         f"from dbsnp_ensembl_species.release_rs_statistics_per_assembly " \
                                         f"where release_version = 2 " \
                                         f"and assembly_accession not in ({assemblies_in_logs});"
        logger.info(query_missing_assemblies_stats)
        missing_assemblies_stats = get_all_results_for_query(
            metadata_connection_handle, query_missing_assemblies_stats)
        for assembly_stats in missing_assemblies_stats:
            taxonomy_id = assembly_stats[0]
            scientific_name = assembly_stats[1]
            assembly_accession = assembly_stats[2]
            release_folder = assembly_stats[3]
            current_rs = assembly_stats[5]
            multi_mapped_rs = assembly_stats[6]
            merged_rs = assembly_stats[7]
            deprecated_rs = assembly_stats[8]
            merged_deprecated_rs = assembly_stats[9]

            # get ss clustered
            query_ss_clustered = f"select sum(new_ss_clustered) " \
                                 f"from dbsnp_ensembl_species.release_rs_statistics_per_assembly " \
                                 f"where assembly_accession = '{assembly_accession}'"
            logger.info(query_ss_clustered)
            ss_clustered_previous_releases = get_all_results_for_query(
                metadata_connection_handle, query_ss_clustered)
            ss_clustered = ss_clustered_previous_releases[0][0]

            insert_query = f"insert into dbsnp_ensembl_species.release_rs_statistics_per_assembly "\
                           f"(taxonomy_id, scientific_name, assembly_accession, release_folder, release_version, " \
                           f"current_rs, multi_mapped_rs, merged_rs, deprecated_rs, merged_deprecated_rs, " \
                           f"new_current_rs, new_multi_mapped_rs, new_merged_rs, new_deprecated_rs, " \
                           f"new_merged_deprecated_rs, new_ss_clustered, remapped_current_rs, " \
                           f"new_remapped_current_rs, split_rs, new_split_rs, ss_clustered, clustered_current_rs, " \
                           f"new_clustered_current_rs) " \
                           f"values ({taxonomy_id}, '{scientific_name}', '{assembly_accession}', '{release_folder}', " \
                           f"3, {current_rs}, {multi_mapped_rs}, {merged_rs}, {deprecated_rs}, " \
                           f"{merged_deprecated_rs}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {ss_clustered}, 0, 0);"
            logger.info(insert_query)
            execute_query(metadata_connection_handle, insert_query)