institute=institute) # If a variant belongs to any gene lists we check which ones mongo_variant['gene_lists'] = variant['info_dict'].get( config_object['VCF']['GeneLists']['vcf_info_key'], None) ################# Add the rank score and variant rank ################# # Get the rank score as specified in the config file. # This is central for displaying variants in scout. mongo_variant['rank_score'] = float( variant.get('rank_scores', {}).get(case_name, 0.0)) ################# Add gt calls ################# gt_calls = [] for individual_id, display_name in iteritems(individuals): # This function returns an ODM GTCall object with the # relevant information for a individual: gt_calls.append( get_genotype(variant, config_object, individual_id, display_name)) mongo_variant['samples'] = gt_calls ################# Add the compound information ################# mongo_variant['compounds'] = get_compounds(variant, case, variant_type) ################# Add the inheritance patterns ################# mongo_variant['genetic_models'] = variant.get('genetic_models', {}).get(case_name, [])
def load_mongo_db(scout_configs, vcf_configs=None, family_type='cmms',
                  mongo_db='variantDatabase', variant_type='clinical',
                  username=None, password=None, port=27017, host='localhost',
                  rank_score_threshold=0, variant_number_threshold=5000):
    """Populate a mongo database with information from ped and variant files.

    Steps, in order: connect to the database, build the case from the ped
    information, save every collaborating institute that is not stored yet,
    update the case, delete the variants already stored for this case and
    ``variant_type``, then insert the variants read from the vcf one by one.

    Args:
        scout_configs: Configuration mapping. The keys ``'load_vcf'`` and
            ``'ped'`` are read here and the whole mapping is handed to
            ``get_case``.
        vcf_configs: Passed unchanged to ``ConfigParser``.
        family_type: Passed unchanged to ``get_case``.
        mongo_db: Name of the database handed to ``connect``.
        variant_type: Selects which stored variants are deleted and is
            forwarded to ``update_case`` and ``get_mongo_variant``.
        username: Forwarded to ``connect``.
        password: Forwarded to ``connect``.
        port: Forwarded to ``connect``.
        host: Forwarded to ``connect``.
        rank_score_threshold: Loading stops at the first variant whose rank
            score for the case is not greater than this value.
        variant_number_threshold: Maximum number of variants to insert.

    Returns:
        None.

    Raises:
        KeyError: ``scout_configs`` and the per-variant rank scores are read
            with plain subscripts, so a missing key propagates to the caller.
    """
    logger = logging.getLogger(__name__)
    # For testing only
    if __name__ == '__main__':
        logger = logging.getLogger("scout.ext.backend.load_mongo")

    ####### Check if the vcf file is on the proper format #######
    vcf_file = scout_configs['load_vcf']
    logger.info(
        "Found a vcf for loading variants into scout: {0}".format(vcf_file))

    logger.info("Connecting to {0}".format(mongo_db))
    connect(mongo_db, host=host, port=port, username=username,
            password=password)
    # NOTE(review): the returned handle is not used in this function; the
    # call is kept in case callers rely on it to surface connection problems
    # early - confirm before removing.
    get_db()

    ped_file = scout_configs['ped']
    logger.info("Found a ped file: {0}".format(ped_file))

    ######## Parse the config file to check for keys ########
    logger.info("Parsing config file")
    config_object = ConfigParser(vcf_configs)

    ######## Get the cases and add them to the mongo db: ########
    logger.info("Get the case from ped file")
    case = get_case(scout_configs, family_type)
    logger.info('Case found in {0}: {1}'.format(ped_file, case.display_name))

    ######## Add the institute to the mongo db: ########
    for institute_name in case['collaborators']:
        # Empty collaborator entries are ignored.
        if not institute_name:
            continue
        institute = get_institute(institute_name)
        logger.info("Institute found: {0}".format(institute))
        try:
            Institute.objects.get(internal_id=institute.internal_id)
        except DoesNotExist:
            institute.save()
            logger.info(
                "Adding new institute {0} to database".format(institute))
        else:
            logger.info(
                "Institute {0} already in database".format(institute))

    logger.info("Updating case in database")
    update_case(case, variant_type, logger)

    ######## Get the variants and add them to the mongo db: ########
    logger.info("Setting up a variant parser")
    variant_parser = VCFParser(infile=vcf_file, split_variants=True,
                               skip_info_check=True)

    # Old variants are removed first so a reload replaces them.
    logger.info("Deleting old variants for case {0}".format(case.case_id))
    Variant.objects(case_id=case.case_id, variant_type=variant_type).delete()
    logger.debug("Variants deleted")

    # Get the individuals to see which we should include in the analysis
    ped_individuals = {
        individual.individual_id: individual.display_name
        for individual in case.individuals
    }

    # Keep only the ped individuals that also exist in the vcf file, as a
    # dictionary with individual ids as keys and display names as values.
    individuals = {}
    logger.info("Checking which individuals in ped file exists in vcf")
    for individual_id, display_name in iteritems(ped_individuals):
        logger.debug("Checking individual {0}".format(individual_id))
        if individual_id in variant_parser.individuals:
            logger.debug("Individual {0} found".format(individual_id))
            individuals[individual_id] = display_name
        else:
            logger.warning("Individual {0} is present in ped file but"
                           " not in vcf".format(individual_id))

    logger.info('Start parsing variants')
    nr_of_variants = 0
    for variant in variant_parser:
        logger.debug("Parsing variant {0}".format(variant['variant_id']))

        # The loop stops (instead of skipping) at the first low score.
        # NOTE(review): this presumably relies on the vcf being sorted by
        # descending rank score - confirm.
        rank_score = float(variant['rank_scores'][case.display_name])
        if not rank_score > rank_score_threshold:
            logger.info("Lower rank score threshold reaced after {0}"
                        " variants".format(nr_of_variants))
            break

        # '>=' so that at most variant_number_threshold variants are stored;
        # the check runs before the insert, so '>' let one extra through.
        if nr_of_variants >= variant_number_threshold:
            logger.info("Variant number threshold reached. ({0})".format(
                variant_number_threshold))
            break

        nr_of_variants += 1
        mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                          case, config_object, nr_of_variants)
        mongo_variant.save()

        if nr_of_variants % 1000 == 0:
            logger.info('{0} variants parsed'.format(nr_of_variants))
def load_mongo_db(scout_configs, vcf_configs=None, family_type='cmms',
                  mongo_db='variantDatabase', variant_type='clinical',
                  username=None, password=None, port=27017, host='localhost',
                  rank_score_threshold=0, variant_number_threshold=5000):
    """Populate a mongo database with information from ped and variant files.

    Steps, in order: connect to the database, build the case from the ped
    information, save every collaborating institute that is not stored yet,
    update the case, delete the variants already stored for this case and
    ``variant_type``, insert the variants read from the vcf one by one, log
    a summary and finally update the database indexes.

    Args:
        scout_configs: Configuration mapping. The keys ``'load_vcf'`` and
            ``'ped'`` are read here and the whole mapping is handed to
            ``get_case``.
        vcf_configs: Passed unchanged to ``ConfigParser``.
        family_type: Passed unchanged to ``get_case``.
        mongo_db: Name of the database handed to ``connect``.
        variant_type: Selects which stored variants are deleted and is
            forwarded to ``update_case`` and ``get_mongo_variant``.
        username: Forwarded to ``connect``.
        password: Forwarded to ``connect``.
        port: Forwarded to ``connect``.
        host: Forwarded to ``connect``.
        rank_score_threshold: Loading stops at the first variant whose rank
            score for the case is not greater than this value.
        variant_number_threshold: Maximum number of variants to insert.

    Returns:
        None.

    Raises:
        KeyError: ``scout_configs`` and the per-variant rank scores are read
            with plain subscripts, so a missing key propagates to the caller.
    """
    logger = logging.getLogger(__name__)
    # For testing only
    if __name__ == '__main__':
        logger = logging.getLogger("scout.ext.backend.load_mongo")

    ####### Check if the vcf file is on the proper format #######
    vcf_file = scout_configs['load_vcf']
    logger.info(
        "Found a vcf for loading variants into scout: {0}".format(vcf_file))

    logger.info("Connecting to {0}".format(mongo_db))
    connect(mongo_db, host=host, port=port, username=username,
            password=password)
    # Handle on the connected database; used for the index update at the end.
    variant_database = get_db()

    ped_file = scout_configs['ped']
    logger.info("Found a ped file: {0}".format(ped_file))

    ######## Parse the config file to check for keys ########
    logger.info("Parsing config file")
    config_object = ConfigParser(vcf_configs)

    ######## Get the cases and add them to the mongo db: ########
    logger.info("Get the case from ped file")
    case = get_case(scout_configs, family_type)
    logger.info('Case found in {0}: {1}'.format(ped_file, case.display_name))

    ######## Add the institute to the mongo db: ########
    for institute_name in case['collaborators']:
        # Empty collaborator entries are ignored.
        if not institute_name:
            continue
        institute = get_institute(institute_name)
        logger.info("Institute found: {0}".format(institute))
        try:
            Institute.objects.get(internal_id=institute.internal_id)
        except DoesNotExist:
            institute.save()
            logger.info(
                "Adding new institute {0} to database".format(institute))
        else:
            logger.info(
                "Institute {0} already in database".format(institute))

    logger.info("Updating case in database")
    update_case(case, variant_type, logger)

    ######## Get the variants and add them to the mongo db: ########
    logger.info("Setting up a variant parser")
    variant_parser = VCFParser(infile=vcf_file, split_variants=True,
                               skip_info_check=True)

    # Old variants are removed first so a reload replaces them.
    logger.info("Deleting old variants for case {0}".format(case.case_id))
    Variant.objects(case_id=case.case_id, variant_type=variant_type).delete()
    logger.debug("Variants deleted")

    # Timestamp taken after the delete and before the individual matching,
    # exactly where the elapsed-time measurement has always started.
    start_inserting_variants = datetime.now()

    # Get the individuals to see which we should include in the analysis
    ped_individuals = {
        individual.individual_id: individual.display_name
        for individual in case.individuals
    }

    # Keep only the ped individuals that also exist in the vcf file, as a
    # dictionary with individual ids as keys and display names as values.
    individuals = {}
    logger.info("Checking which individuals in ped file exists in vcf")
    for individual_id, display_name in iteritems(ped_individuals):
        logger.debug("Checking individual {0}".format(individual_id))
        if individual_id in variant_parser.individuals:
            logger.debug("Individual {0} found".format(individual_id))
            individuals[individual_id] = display_name
        else:
            logger.warning("Individual {0} is present in ped file but"
                           " not in vcf".format(individual_id))

    logger.info('Start parsing variants')
    nr_of_variants = 0
    for variant in variant_parser:
        logger.debug("Parsing variant {0}".format(variant['variant_id']))

        # The loop stops (instead of skipping) at the first low score.
        # NOTE(review): this presumably relies on the vcf being sorted by
        # descending rank score - confirm.
        rank_score = float(variant['rank_scores'][case.display_name])
        if not rank_score > rank_score_threshold:
            logger.info("Lower rank score threshold reaced after {0}"
                        " variants".format(nr_of_variants))
            break

        # '>=' so that at most variant_number_threshold variants are stored;
        # the check runs before the insert, so '>' let one extra through.
        if nr_of_variants >= variant_number_threshold:
            logger.info("Variant number threshold reached. ({0})".format(
                variant_number_threshold))
            break

        nr_of_variants += 1
        mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                          case, config_object, nr_of_variants)
        mongo_variant.save()

        if nr_of_variants % 1000 == 0:
            logger.info('{0} variants parsed'.format(nr_of_variants))

    logger.info("Parsing variants done")
    logger.info("{0} variants inserted".format(nr_of_variants))
    logger.info("Time to insert variants: {0}".format(
        datetime.now() - start_inserting_variants))

    logger.info("Updating indexes")
    ensure_indexes(variant_database, logger)