Example #1
    def test_get_mongo_uri_for_eva_profile(self):
        self.assertEqual(
            get_mongo_uri_for_eva_profile('test', self.config_file),
            'mongodb://*****:*****@mongo.example.com:27017,mongo.example-primary.com:27017/admin'
        )
        self.assertRaises(ValueError, get_mongo_uri_for_eva_profile, 'test1',
                          self.config_file)
        # test for local mongo with no authentication
        self.assertEqual(
            get_mongo_uri_for_eva_profile('local', self.config_file),
            'mongodb://localhost:27017')
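The snippet below is not from the project code above; it is a minimal hypothetical sketch of how a helper like get_mongo_uri_for_eva_profile could assemble such a URI from a Maven-style settings XML. The function name and the property keys (eva.mongo.host, eva.mongo.user, eva.mongo.passwd) are assumptions for illustration only; the real helper may read different keys.
from xml.etree import ElementTree


def build_mongo_uri_from_settings(profile_name, settings_xml_file):
    # Hypothetical sketch, not the real implementation.
    tree = ElementTree.parse(settings_xml_file)
    # Strip XML namespaces so the tag look-ups below stay simple.
    for element in tree.iter():
        element.tag = element.tag.split('}')[-1]
    for profile in tree.getroot().iter('profile'):
        if profile.findtext('id') != profile_name:
            continue
        # Assumes each profile declares a <properties> block.
        properties = {prop.tag: prop.text for prop in profile.find('properties')}
        hosts = properties['eva.mongo.host']  # e.g. "host1:27017,host2:27017" (assumed key)
        user = properties.get('eva.mongo.user')
        password = properties.get('eva.mongo.passwd')
        if user and password:
            return 'mongodb://{0}:{1}@{2}/admin'.format(user, password, hosts)
        return 'mongodb://{0}'.format(hosts)
    raise ValueError('Profile %s not found in %s' % (profile_name, settings_xml_file))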
Example #2
def deprecate(settings_xml_file, database_name, contigs=None):
    """
    Connect to MongoDB and retrieve all variants that need to be deprecated.
    Copy the variants to the to_delete collection and delete them from the variants collection.
    """
    with pymongo.MongoClient(
            get_mongo_uri_for_eva_profile('production',
                                          settings_xml_file)) as mongo_handle:
        variant_collection = mongo_handle[database_name]['variants_2_0']
        deleted_variant_collection = mongo_handle[database_name][
            'to_delete_variants_2_0']

        cursor = variant_collection.find({'chr': {'$in': contigs}})
        drop_statements = []
        insert_statements = []
        for variant in cursor:
            insert_statements.append(pymongo.InsertOne(variant))
            drop_statements.append(pymongo.DeleteOne({'_id': variant['_id']}))

        # Perform the bulk writes while the MongoClient is still open
        logger.info('Found %s variants to remove', len(drop_statements))
        result_insert = deleted_variant_collection.bulk_write(
            requests=insert_statements, ordered=False)
        result_drop = variant_collection.bulk_write(requests=drop_statements,
                                                    ordered=False)
        logger.info('There were %s new documents inserted in the to_delete collection',
                    result_insert.inserted_count)
        logger.info('There were %s documents dropped from the variants collection',
                    result_drop.deleted_count)
def get_multimap_snps_from_mongo(private_config_xml_file,
                                 collection_to_validate):
    #  Dirty hack: since mongoexport does not allow switching databases
    #  replace admin in the URI with the database name and relegate admin to authSource
    production_mongo_uri = get_mongo_uri_for_eva_profile("production", private_config_xml_file) \
        .replace("/admin", "/eva_accession_sharded?authSource=admin")
    output_file = collection_to_validate + "_multimap_snp_ids.txt"
    accession_attribute = collection_attribute_paths[collection_to_validate][
        "rs_accession_attribute_name"].replace("inactiveObjects.",
                                               "inactiveObjects.0.")
    assembly_attribute = collection_attribute_paths[collection_to_validate][
        "assembly_attribute_name"].replace("inactiveObjects.",
                                           "inactiveObjects.0.")

    export_command = 'mongoexport --uri "{0}" --collection {1} --type=csv --fields \'{2},{3}\' ' \
                     '--query \'{{"{4}": {{$exists: true}}}}\' --noHeaderLine --out {5}' \
        .format(production_mongo_uri, collection_to_validate,
                accession_attribute, assembly_attribute,
                collection_attribute_paths[collection_to_validate]["mapping_weight_attribute_path"]
                .replace("$.", ""), output_file)
    # Mongoexport is one of those brain-damaged commands that outputs progress to stderr.
    # So, log error stream to output.
    run_command_with_output("Export multimap SNP IDs in collection: " +
                            collection_to_validate,
                            export_command,
                            log_error_stream_to_output=True)
    run_command_with_output(
        "Sorting multimap SNP IDs from collection: " + collection_to_validate,
        "sort -u {0} -o {0}".format(output_file))
    return output_file
def collect_mongo_genbank_contigs(private_config_xml_file, assembly_accession):
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") \
                as metadata_connection_handle, MongoClient(get_mongo_uri_for_eva_profile("development",
                                                                                         private_config_xml_file)) \
                as mongo_connection_handle:
            main_collections = [
                "dbsnpSubmittedVariantEntity", "submittedVariantEntity"
            ]
            for collection in main_collections:
                insert_contig_info_to_db(collection, assembly_accession,
                                         metadata_connection_handle,
                                         mongo_connection_handle)
            ops_collections = [
                "dbsnpSubmittedVariantOperationEntity",
                "submittedVariantOperationEntity"
            ]
            for collection in ops_collections:
                insert_contig_info_to_db(
                    collection,
                    assembly_accession,
                    metadata_connection_handle,
                    mongo_connection_handle,
                    assembly_attribute_prefix="inactiveObjects.")
    except Exception:
        logger.error(traceback.format_exc())
def deprecate(settings_xml_file, study, assembly_accession, contigs=None):
    """
    Connect to MongoDB and retrieve all variants that need to be deprecated.
    Copy the variants to the operation collection and delete them from the submitted variant collection.
    """
    with pymongo.MongoClient(get_mongo_uri_for_eva_profile('production', settings_xml_file)) as accessioning_mongo_handle:
        sve_collection = accessioning_mongo_handle['eva_accession_sharded']["submittedVariantEntity"]
        deprecated_sve_collection = accessioning_mongo_handle['eva_accession_sharded']["submittedVariantOperationEntity"]
        cursor = sve_collection.find({'seq': assembly_accession, 'study': study, 'contig': {'$in': contigs}})
        insert_statements = []
        drop_statements = []
        for variant in cursor:
            insert_statements.append(pymongo.InsertOne(inactive_object(variant)))
            drop_statements.append(pymongo.DeleteOne({'_id': variant['_id']}))

        # There should only be 458 variants to deprecate
        assert len(insert_statements) == 458
        assert len(drop_statements) == 458

        logger.info('Found %s variants to deprecate', len(insert_statements))

        # Perform the bulk writes while the MongoClient is still open
        result_insert = deprecated_sve_collection.bulk_write(requests=insert_statements, ordered=False)
        result_drop = sve_collection.bulk_write(requests=drop_statements, ordered=False)
        logger.info('There were %s new documents inserted as inactive entities', result_insert.inserted_count)
        logger.info('There were %s old documents dropped from the submitted variant collection', result_drop.deleted_count)
def mongo_data_copy_to_remote_host(local_forwarded_port, private_config_xml_file, profile, assembly_accession,
                                   collections_to_copy_map, dump_dir, destination_db_name):
    mongo_params = parse_uri(get_mongo_uri_for_eva_profile(profile, private_config_xml_file))
    # nodelist is in format: [(host1,port1), (host2,port2)]. Just choose one.
    # Mongo is smart enough to fall back to secondaries automatically.
    mongo_host = mongo_params["nodelist"][0][0]
    logger.info("Beginning data copy for assembly: " + assembly_accession)
    dump_output_dir = "{0}/dump_{1}".format(dump_dir, assembly_accession.replace(".", "_"))

    # To be idempotent, clear source dump files
    shutil.rmtree(dump_output_dir, ignore_errors=True)

    for collection, collection_assembly_attribute_name in sorted(collections_to_copy_map.items()):
        logger.info("Begin processing collection: " + collection)
        # Double the curly braces that are not positional parameters so str.format leaves them in place
        query = """'{{"{0}": {{"$in":["{1}"]}}}}'""".format(collection_assembly_attribute_name, assembly_accession)
        sharded_db_name = "eva_accession_sharded"
        mongodump_args = {"db": sharded_db_name, "host": mongo_host,
                          "username": mongo_params["username"], "password": mongo_params["password"],
                          "authenticationDatabase": "admin", "collection": collection,
                          "query": query, "out": dump_output_dir
                          }
        mongorestore_args = {"db": destination_db_name,
                             "dir": "{0}/{1}/{2}.bson".format(dump_output_dir, sharded_db_name, collection),
                             "collection": collection, "port": local_forwarded_port}
        logger.info("Running export to {0} with query {1} against {2}.{3} in {4}"
                    .format(dump_output_dir, query, sharded_db_name, collection, profile))
        copy_db(mongodump_args, mongorestore_args)
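copy_db itself is not included in these examples. The sketch below is a minimal hypothetical version, assuming it simply turns the two argument dicts into mongodump/mongorestore command lines and runs them with the run_command_with_output helper seen in the mongoexport example; the real helper may differ.
def copy_db(mongodump_args, mongorestore_args):
    # Hypothetical sketch only; the flag names mirror the dict keys used by the callers above.
    dump_command = 'mongodump ' + ' '.join(
        '--{0} {1}'.format(key, value) for key, value in mongodump_args.items())
    restore_command = 'mongorestore ' + ' '.join(
        '--{0} {1}'.format(key, value) for key, value in mongorestore_args.items())
    # Both tools write progress to stderr, so log the error stream to output.
    run_command_with_output('mongodump', dump_command, log_error_stream_to_output=True)
    run_command_with_output('mongorestore', restore_command, log_error_stream_to_output=True)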
def copy_database_to(source_database, destination_database,
                     private_config_xml_file, dump_dir):
    mongo_params = parse_uri(
        get_mongo_uri_for_eva_profile("production", private_config_xml_file))
    # nodelist is in format: [(host1,port1), (host2,port2)]. Just choose one.
    # Mongo is smart enough to fall back to secondaries automatically.
    mongo_host = mongo_params["nodelist"][0][0]
    logger.info("Beginning data copy for: " + source_database)
    dump_output_dir = "{0}/dump_{1}".format(dump_dir,
                                            source_database.replace(".", "_"))

    mongodump_args = {
        "db": source_database,
        "host": mongo_host,
        "username": mongo_params["username"],
        "password": mongo_params["password"],
        "authenticationDatabase": "admin",
        "out": dump_output_dir
    }
    mongorestore_args = {
        "db": destination_database,
        "host": mongo_host,
        "username": mongo_params["username"],
        "password": mongo_params["password"],
        "dir": "{0}/{1}/".format(dump_output_dir, source_database)
    }
    logger.info("Running export to {0} against {2} in production".format(
        dump_output_dir, source_database))
    copy_db(mongodump_args, mongorestore_args)
def main():
    parser = argparse.ArgumentParser(
        description='Create and load the clustering and release tracking table',
        add_help=False)
    parser.add_argument("--private-config-xml-file",
                        help="ex: /path/to/eva-maven-settings.xml",
                        required=True)
    parser.add_argument("--release-version",
                        help="version of the release",
                        type=int,
                        required=True)
    parser.add_argument(
        "--reference-directory",
        help=
        "Directory where the reference genomes exists or should be downloaded",
        required=True)
    parser.add_argument(
        "--taxonomy",
        help="taxonomy id for which rs count needs to be updated",
        type=int,
        required=False)
    parser.add_argument('--tasks',
                        required=False,
                        type=str,
                        nargs='+',
                        default=all_tasks,
                        choices=all_tasks,
                        help='Task or set of tasks to perform.')
    parser.add_argument('--help',
                        action='help',
                        help='Show this help message and exit')
    args = parser.parse_args()

    logging_config.add_stdout_handler()

    if not args.tasks:
        args.tasks = all_tasks

    if 'create_and_fill_table' in args.tasks:
        create_table(args.private_config_xml_file)
        fill_in_from_previous_inventory(args.private_config_xml_file,
                                        args.release_version)
        fill_in_table_from_remapping(args.private_config_xml_file,
                                     args.release_version,
                                     args.reference_directory)

    if 'fill_rs_count' in args.tasks:
        if not args.taxonomy:
            raise Exception(
                "For running task 'fill_rs_count', it is mandatory to provide taxonomy arguments"
            )
        mongo_source_uri = get_mongo_uri_for_eva_profile(
            'production', args.private_config_xml_file)
        mongo_source = MongoDatabase(uri=mongo_source_uri,
                                     db_name="eva_accession_sharded")
        fill_num_rs_id_for_taxonomy_and_assembly(mongo_source,
                                                 args.private_config_xml_file,
                                                 args.release_version,
                                                 args.taxonomy,
                                                 args.reference_directory)
    def __init__(self, eload_number):
        super().__init__(eload_number)
        self.settings_xml_file = cfg['maven']['settings_file']
        self.project_accession = self.eload_cfg.query('brokering', 'ena',
                                                      'PROJECT')
        self.project_dir = self.setup_project_dir()
        self.mongo_uri = get_mongo_uri_for_eva_profile(
            cfg['maven']['environment'], self.settings_xml_file)
Example #10
def check_all_contigs(private_config_xml_file,
                      databases,
                      profile='production'):
    db_assembly = get_db_name_and_assembly_accession(databases)
    for db_name, info in db_assembly.items():
        assembly = info['assembly']
        asm_report = info['asm_report']
        logger.info(
            f"Check database {db_name} (assembly {assembly}) with report {asm_report}"
        )

        contig_synonym_dictionaries = load_synonyms_for_assembly(
            assembly, asm_report)

        with pymongo.MongoClient(
                get_mongo_uri_for_eva_profile(
                    profile, private_config_xml_file)) as mongo_handle:
            variants_collection = mongo_handle[db_name]["variants_2_0"]
            cursor = variants_collection.aggregate([{
                '$group': {
                    '_id': '$chr',
                    'count': {
                        '$sum': 1
                    }
                }
            }])

            translatable_contigs = 0
            translatable_variants = 0
            notranslation_variants = 0
            notranslation_contigs = set()
            # Consume the cursor while the Mongo connection is still open
            for contig in cursor:
                try:
                    _ = get_genbank(contig_synonym_dictionaries, contig['_id'])
                    translatable_contigs += 1
                    translatable_variants += contig['count']
                except KeyError:
                    notranslation_variants += contig['count']
                    notranslation_contigs.add(contig['_id'])

        if len(notranslation_contigs) > 0:
            raise ValueError(
                f'Aborting update (no changes were made). '
                f'With the provided assembly report, the following {len(notranslation_contigs)} '
                f'contigs (present in {notranslation_variants} variants) cannot be '
                f'replaced: {notranslation_contigs}')
        else:
            logger.info(
                f'Check OK. {translatable_contigs} contigs in {translatable_variants} variants will be '
                f'translated to GenBank')
        return translatable_variants
def import_mapping_weight_attribute_for_dbsnp_species(private_config_xml_file, metadata_connection_handle,
                                                      dbsnp_species_taxonomy):
    metadata_params = metadata_connection_handle.get_dsn_parameters()
    mongo_params = parse_uri(get_mongo_uri_for_eva_profile("production", private_config_xml_file))
    # nodelist is in format: [(host1,port1), (host2,port2)]. Just choose one.
    # Mongo is smart enough to fall back to secondaries automatically.
    mongo_host = mongo_params["nodelist"][0][0]
    for assembly in get_assemblies_to_import_for_dbsnp_species(metadata_connection_handle,
                                                               dbsnp_species_taxonomy, release_version=2):

        incorporate_mapping_weight_into_accessioning(metadata_params["dbname"], metadata_params["user"],
                                                     metadata_params["host"],
                                                     mongo_params["username"], mongo_params["password"], mongo_host,
                                                     assembly)
def check_RS_release_JSON_assumptions(private_config_xml_file,
                                      release_json_file,
                                      eva_production_human_dbsnp_build,
                                      eva_production_human_dbsnp_assembly):
    with bz2.open(release_json_file) as release_json_file_handle, \
            MongoClient(get_mongo_uri_for_eva_profile("production", private_config_xml_file)) \
                    as mongo_connection_handle:
        line_index = 0
        for json_line in release_json_file_handle:
            if line_index % 100000 == 0:
                logger.info("Processed {0} records...".format(line_index))
            line_index += 1
            rs_record = json.loads(json_line.decode("utf-8").strip())
            if not (is_rs_id_mapped_to_assembly(
                    rs_record, eva_production_human_dbsnp_assembly)):
                logger.error("RS ID {0} is not mapped to assembly {1}".format(
                    rs_record["refsnp_id"],
                    eva_production_human_dbsnp_assembly))
                continue
            rs_id = int(rs_record["refsnp_id"])

            if "support" in rs_record["primary_snapshot_data"]:
                support_record_count = 0
                support_record_last_updated_builds = set()
                for support_record in rs_record["present_obs_movements"]:
                    if "last_added_to_this_rs" in support_record:
                        support_record_count += 1
                        support_record_last_updated_builds.add(
                            int(support_record["last_added_to_this_rs"]))
                if support_record_count == 0:
                    logger.error(
                        "Support record not found for RS {0} at line {1}!!".
                        format(rs_id, line_index))
                    continue
                rs_last_updated_build = min(support_record_last_updated_builds)
                # Ensure newly added RS are not in production
                if rs_last_updated_build > eva_production_human_dbsnp_build:
                    ensure_new_rs_not_in_eva_human_accession_db(
                        mongo_connection_handle,
                        eva_production_human_dbsnp_build, rs_id)
                # Ensure RS in previous builds are present in production
                if rs_last_updated_build <= eva_production_human_dbsnp_build:
                    ensure_existing_rs_in_eva_human_accession_db(
                        mongo_connection_handle,
                        eva_production_human_dbsnp_build, rs_id)

        ensure_new_rs_not_in_eva_human_accession_db(
            mongo_connection_handle, eva_production_human_dbsnp_build)
        ensure_existing_rs_in_eva_human_accession_db(
            mongo_connection_handle, eva_production_human_dbsnp_build)
def are_all_unprocessed_multimap_snps_absent_in_mongo(
        private_config_xml_file, collection_to_validate,
        unprocessed_multimap_snps_in_dbsnp_file):
    chunk_size = 2000
    num_entries_looked_up = 0
    with MongoClient(
            get_mongo_uri_for_eva_profile(
                "production", private_config_xml_file)) as mongo_handle:
        with open(unprocessed_multimap_snps_in_dbsnp_file
                  ) as unprocessed_snps_in_dbsnp_file_handle:
            while True:
                snps_to_lookup_in_mongo = defaultdict(list)
                lines = list(
                    islice(unprocessed_snps_in_dbsnp_file_handle, chunk_size))
                # Each line contains: RS ID,assembly
                for line in lines:
                    assembly = line.split(",")[1].rstrip()
                    snp_id = int(line.split(",")[0].rstrip())
                    snps_to_lookup_in_mongo[assembly].append(snp_id)

                accession_attribute = collection_attribute_paths[
                    collection_to_validate]["rs_accession_attribute_name"]
                assembly_attribute = collection_attribute_paths[
                    collection_to_validate]["assembly_attribute_name"]
                if len(snps_to_lookup_in_mongo.keys()) > 0:
                    for assembly in snps_to_lookup_in_mongo.keys():
                        unprocessed_multimap_snp_from_dbsnp_present_in_mongo = \
                            mongo_handle["eva_accession_sharded"][collection_to_validate] \
                                .find_one(
                                {assembly_attribute: assembly,
                                 accession_attribute: {"$in": snps_to_lookup_in_mongo[assembly]}})
                        if unprocessed_multimap_snp_from_dbsnp_present_in_mongo is not None:
                            raise Exception(
                                "Some unprocessed multimap SNPs from dbSNP source dumps were present in Mongo. "
                                "See rs ID " +
                                str(unprocessed_multimap_snp_from_dbsnp_present_in_mongo[
                                    accession_attribute]) + " for example")
                    num_entries_looked_up += chunk_size
                    logger.info("Looked up {0} entries so far...".format(
                        num_entries_looked_up))
                else:
                    break
def correct(private_config_xml_file,
            profile='production',
            mongo_database='eva_accession_sharded'):
    with pymongo.MongoClient(
            get_mongo_uri_for_eva_profile(
                profile, private_config_xml_file)) as mongo_handle:
        sve_collection = mongo_handle[mongo_database]["submittedVariantEntity"]
        filter_criteria = {'seq': 'GCA_002742125.1', 'study': 'PRJEB42582'}
        cursor = sve_collection.find(filter_criteria)
        insert_statements = []
        drop_statements = []
        number_of_variants_to_replace = 10
        total_inserted, total_dropped = 0, 0
        try:
            for variant in cursor:
                original_id = get_SHA1(variant)
                assert variant['_id'] == original_id, \
                    "Original id is different from the one calculated %s != %s" % (
                        variant['_id'], original_id)
                variant['contig'] = 'CM008482.1'
                variant['_id'] = get_SHA1(variant)
                insert_statements.append(pymongo.InsertOne(variant))
                drop_statements.append(pymongo.DeleteOne({'_id': original_id}))
            result_insert = sve_collection.bulk_write(
                requests=insert_statements, ordered=False)
            total_inserted += result_insert.inserted_count
            result_drop = sve_collection.bulk_write(requests=drop_statements,
                                                    ordered=False)
            total_dropped += result_drop.deleted_count
            logging.info('%s / %s new documents inserted' %
                         (total_inserted, number_of_variants_to_replace))
            logging.info('%s / %s old documents dropped' %
                         (total_dropped, number_of_variants_to_replace))
        except Exception as e:
            print(traceback.format_exc())
            raise e
        finally:
            cursor.close()
        return total_inserted
Example #15
def get_mongo_connection_handle(profile: str,
                                settings_xml_file: str) -> pymongo.MongoClient:
    mongo_connection_uri = get_mongo_uri_for_eva_profile(
        profile, settings_xml_file)
    return pymongo.MongoClient(mongo_connection_uri)
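A small usage sketch for the helper above; the profile name and settings path are placeholders. Because the helper returns a plain pymongo.MongoClient, it can be used as a context manager so the connection is closed automatically.
# Usage sketch; 'production' and the settings path are placeholders.
with get_mongo_connection_handle('production', '/path/to/eva-maven-settings.xml') as mongo_handle:
    collection_names = mongo_handle['eva_accession_sharded'].list_collection_names()
    print(collection_names)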
Example #16
def populate_ids(private_config_xml_file,
                 databases,
                 profile='production',
                 mongo_accession_db='eva_accession_sharded'):
    db_assembly = get_db_name_and_assembly_accession(databases)
    for db_name, info in db_assembly.items():
        assembly = info['assembly']
        asm_report = info['asm_report']
        logger.info(f"Processing database {db_name} (assembly {assembly})")

        contig_synonym_dictionaries = load_synonyms_for_assembly(
            assembly, asm_report)

        with pymongo.MongoClient(
                get_mongo_uri_for_eva_profile(
                    profile, private_config_xml_file)) as mongo_handle:
            variants_collection = mongo_handle[db_name]["variants_2_0"]
            logger.info(
                f"Querying variants from variant warehouse, database {db_name}"
            )
            batch_size = 1
            variants_cursor = get_variants_from_variant_warehouse(
                variants_collection, batch_size)
            hash_to_variant_ids = {}
            update_statements = []
            try:
                count_variants = 0
                batch_number = 0
                for variant_query_result in variants_cursor:
                    hash_to_variant_id, contigs_no_genbank = get_hash_to_variant_id(
                        assembly, contig_synonym_dictionaries,
                        variant_query_result)
                    hash_to_variant_ids.update(hash_to_variant_id)
                    count_variants += 1
                    if count_variants == batch_size:
                        batch_number += 1
                        # Generate update statements
                        logger.info(
                            f"Generating update statements: database {db_name} (batch {batch_number})"
                        )
                        sve_hashes = hash_to_variant_ids.keys()
                        hash_to_accession_info = get_from_accessioning_db(
                            mongo_handle, mongo_accession_db, sve_hashes)
                        update_statements.extend(
                            generate_update_statement(hash_to_variant_ids,
                                                      hash_to_accession_info))

                        hash_to_variant_ids.clear()
                        count_variants = 0
                if count_variants > 0:
                    logger.info(
                        f"Generating update statements: database {db_name} (batch {batch_number+1})"
                    )
                    sve_hashes = hash_to_variant_ids.keys()
                    hash_to_accession_info = get_from_accessioning_db(
                        mongo_handle, mongo_accession_db, sve_hashes)
                    update_statements.extend(
                        generate_update_statement(hash_to_variant_ids,
                                                  hash_to_accession_info))
            except ValueError as e:
                print(traceback.format_exc())
                raise e
            finally:
                variants_cursor.close()

            if len(contigs_no_genbank) > 0:
                raise ValueError(
                    f"Contigs {contigs_no_genbank} don't have a genbank equivalent, check assembly report"
                )

            modified_count = 0
            if update_statements:
                result_update = variants_collection.with_options(
                    write_concern=WriteConcern(w="majority", wtimeout=1200000)) \
                    .bulk_write(requests=update_statements, ordered=False)
                modified_count = result_update.modified_count if result_update else 0
                logger.info(f"{modified_count} variants modified in {db_name}")

        return modified_count
Example #17
    def test_get_mongo_uri_for_eva_profile(self):
        self.assertEqual(
            get_mongo_uri_for_eva_profile('test', self.config_file),
            'mongodb://*****:*****@mongo.example.com:27017/admin')
        self.assertRaises(ValueError, get_mongo_uri_for_eva_profile, 'test1',
                          self.config_file)
Example #18
    def __init__(self, eload_number, config_object: EloadConfig = None):
        super().__init__(eload_number, config_object)
        self.project_accession = self.eload_cfg.query('brokering', 'ena',
                                                      'PROJECT')
        self.mongo_uri = get_mongo_uri_for_eva_profile(
            cfg['maven']['environment'], cfg['maven']['settings_file'])