def test_get_profiles(real_mongo_adapter, profile_vcf_path, zipped_vcf_path):
    """Check that get_profiles gives one equally long profile per vcf individual."""
    # GIVEN a database holding the profile variants and the parsed vcf info
    load_profile_variants(real_mongo_adapter, profile_vcf_path)
    expected_individuals = check_vcf(zipped_vcf_path)["individuals"]

    # WHEN the profiles are built from the vcf
    profiles = get_profiles(real_mongo_adapter, zipped_vcf_path)

    # THEN every individual of the vcf has a profile, in the same order
    sample_ids = list(profiles)
    assert sample_ids == expected_individuals

    # THEN all profiles share one length (at most one distinct length exists)
    distinct_lengths = {len(profiles[sample_id]) for sample_id in sample_ids}
    assert len(distinct_lengths) <= 1
def load_profile(ctx, load, variant_file, update, stats, profile_threshold, check_vcf):
    """
    Command for profiling of samples. User may upload variants used in profiling
    from a vcf, update the profiles for all samples, and get some stats
    from the profiles in the database.

    Profiling is used to monitor duplicates in the database. The profile is
    based on the variants in the 'profile_variant' collection, assessing
    the genotypes for each sample at the position of these variants.
    """
    # NOTE(review): the docstring wording is left untouched on purpose; this
    # looks like a CLI command whose docstring doubles as help text - confirm.
    adapter = ctx.obj['adapter']

    LOG.info("Running loqusdb profile")

    # Each option below is handled independently; several may be combined.
    if check_vcf:
        # Compare the profiles found in the given vcf with the stored ones
        LOG.info(f"Check if profile in {check_vcf} has match in database")
        vcf_profiles = get_profiles(adapter, check_vcf)
        match = check_duplicates(adapter, vcf_profiles, profile_threshold)

        if match is None:
            LOG.info("No duplicates found in the database")
        else:
            click.echo(json.dumps(match))

    if load:
        # The build specific default is always looked up, an explicitly
        # given variant file then takes precedence over it
        default_path = MAF_PATH[ctx.obj['genome_build']]
        vcf_path = default_path if variant_file is None else variant_file
        LOG.info(f"Loads variants in {vcf_path} to be used in profiling")
        load_profile_variants(adapter, vcf_path)

    if update:
        LOG.info("Updates profiles in database")
        update_profiles(adapter)

    if stats:
        LOG.info("Prints profile stats")
        distances = profile_stats(adapter, threshold=profile_threshold)
        click.echo(table_from_dict(distances))
def load_database(
    adapter,
    variant_file=None,
    sv_file=None,
    family_file=None,
    family_type="ped",
    skip_case_id=False,
    gq_treshold=None,
    case_id=None,
    max_window=3000,
    profile_file=None,
    hard_threshold=0.95,
    soft_threshold=0.9,
    genome_build=None,
):
    """Load the database with a case and its variants

    The vcf files are checked, an optional profile match is performed, the
    case is built and loaded and finally the variants of each given vcf are
    loaded. If loading the variants fails the case is deleted again and the
    error is re-raised.

    Args:
          adapter: Connection to database
          variant_file(str): Path to variant file
          sv_file(str): Path to sv variant file
          family_file(str): Path to family file
          family_type(str): Format of family file
          skip_case_id(bool): If no case information should be added to variants
          gq_treshold(int): If only quality variants should be considered
          case_id(str): If different case id than the one in family file should be used
          max_window(int): Specify the max size for sv windows
          profile_file(str): Path to the vcf used for profiling. When given,
                             profiles are built from it and matched against
                             the database
          hard_threshold(float): Rejects load if hamming distance above this is found
                                 (forwarded to profile_match)
          soft_threshold(float): Stores similar samples if hamming distance above this is found
                                 (forwarded to profile_match)
          genome_build(str): Forwarded unchanged to load_variants

    Returns:
          nr_inserted(int)

    Raises:
          SyntaxError: If gq_treshold is set and a vcf does not define GQ
    """
    vcf_files = []

    nr_variants = None
    vcf_individuals = None
    if variant_file:
        vcf_info = check_vcf(variant_file)
        nr_variants = vcf_info["nr_variants"]
        vcf_files.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = vcf_info["individuals"]

    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        vcf_info = check_vcf(sv_file, "sv")
        nr_sv_variants = vcf_info["nr_variants"]
        vcf_files.append(sv_file)
        sv_individuals = vcf_info["individuals"]

    profiles = None
    matches = None
    if profile_file:
        profiles = get_profiles(adapter, profile_file)
        # Check if any profile already exists
        matches = profile_match(
            adapter,
            profiles,
            hard_threshold=hard_threshold,
            soft_threshold=soft_threshold,
        )

    # If a gq treshold is used the variants needs to have GQ
    for _vcf_file in vcf_files:
        # Get a cyvcf2.VCF object
        vcf = get_vcf(_vcf_file)

        if gq_treshold and not vcf.contains("GQ"):
            LOG.warning("Set gq-treshold to 0 or add info to vcf %s", _vcf_file)
            # The exception type is kept as is since callers may catch it
            raise SyntaxError("GQ is not defined in vcf header")

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        LOG.info("Loading family from %s", family_file)
        with open(family_file, "r") as family_lines:
            family = get_case(family_lines=family_lines, family_type=family_type)
            family_id = family.family_id

    # There has to be a case_id or a family at this stage.
    # An explicitly given case_id wins over the id from the family file.
    case_id = case_id or family_id

    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
        profiles=profiles,
        matches=matches,
        profile_path=profile_file,
    )

    # Build and load a new case, or update an existing one
    load_case(
        adapter=adapter,
        case_obj=case_obj,
    )

    nr_inserted = 0
    # If case was successfully added we can store the variants
    for file_type, variant_type in (("vcf_path", "snv"), ("vcf_sv_path", "sv")):
        if case_obj.get(file_type) is None:
            continue

        # NOTE(review): get_vcf is called outside the try block, so a failure
        # here does not trigger the rollback below - confirm that is intended.
        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
                genome_build=genome_build,
            )
        except Exception as err:
            # If something went wrong do a rollback
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
            )
            # Bare raise re-raises the active exception unchanged
            raise

    return nr_inserted