def test_check_vcf_wrong_type(sv_vcf_path):
    """Checking an SV file while asking for SNVs must raise a VcfError."""
    ## GIVEN a sv vcf file
    requested_type = 'snv'

    ## WHEN collecting the VCF info with wrong variant type
    ## THEN assert that a VcfError is raised
    with pytest.raises(VcfError):
        check_vcf(sv_vcf_path, requested_type)
def test_check_sv_vcf(sv_vcf_path):
    """The info collected for an SV file has the right count and type."""
    ## GIVEN a vcf file and the number of non-header lines it contains
    with open(sv_vcf_path, "r") as handle:
        expected_count = sum(1 for row in handle if not row.startswith("#"))

    ## WHEN collecting the VCF info
    info = check_vcf(sv_vcf_path, "sv")

    ## THEN assert that the number of variants collected is correct
    assert info["nr_variants"] == expected_count

    ## THEN assert that the variant type is correct
    assert info["variant_type"] == "sv"
def test_check_vcf_correct(vcf_path):
    """The info collected with the default variant type reports SNVs."""
    ## GIVEN a vcf file and the number of non-header lines it contains
    with open(vcf_path, 'r') as handle:
        expected_count = sum(1 for row in handle if not row.startswith('#'))

    ## WHEN collecting the VCF info
    info = check_vcf(vcf_path)

    ## THEN assert that the number of variants collected is correct
    assert info['nr_variants'] == expected_count

    ## THEN assert that the variant type is correct
    assert info['variant_type'] == 'snv'
def test_get_profiles(real_mongo_adapter, profile_vcf_path, zipped_vcf_path):
    """Profiles cover every individual in the VCF and share one length."""
    # Load profile variants
    load_profile_variants(real_mongo_adapter, profile_vcf_path)

    expected_individuals = check_vcf(zipped_vcf_path)["individuals"]

    # Get profiles from vcf
    profiles = get_profiles(real_mongo_adapter, zipped_vcf_path)

    # Assert that all individuals are included
    assert list(profiles.keys()) == expected_individuals

    # Assert that profile strings are of same lengths
    # (every length is compared against the first one; nothing to check
    # when there are no profiles at all)
    profile_lengths = [len(profiles[name]) for name in profiles.keys()]
    assert all(size == profile_lengths[0] for size in profile_lengths)
def test_check_vcf(vcf_path):
    """check_vcf agrees with a direct read of the file through VCF."""
    ## GIVEN the path to a vcf
    reader = VCF(vcf_path)
    expected_individuals = reader.samples
    # Exhaust the reader once to count the variant records
    expected_count = sum(1 for _ in reader)

    ## WHEN checking the vcf
    info = check_vcf(vcf_path)

    ## THEN assert that the number of variants is correct
    assert info['nr_variants'] == expected_count

    ## THEN assert that the individuals are returned
    assert info['individuals'] == expected_individuals

    ## THEN assert that the variant type is correct
    assert info['variant_type'] == 'snv'
def load_profile_variants(adapter, variant_file):
    """Loads variants used for profiling

    The file is first run through check_vcf; only files reported as type
    "snv" are accepted. Every variant in the file is then converted with
    build_profile_variant and the whole list is handed to the adapter in
    one call.

    Args:
        adapter (loqusdb.plugins.Adapter): initialized plugin
        variant_file(str): Path to variant file

    Raises:
        VcfError: If the file is reported as anything other than "snv"
    """
    vcf_info = check_vcf(variant_file)

    # Guard clause: profiling is only defined for SNV files
    if vcf_info["variant_type"] != "snv":
        LOG.critical("Variants used for profiling must be SNVs only")
        raise VcfError

    vcf = get_vcf(variant_file)

    profile_variants = [build_profile_variant(variant) for variant in vcf]
    adapter.add_profile_variants(profile_variants)
def test_check_vcf_unsorted(unsorted_vcf_path):
    """A VCF whose variants are out of order must raise a VcfError."""
    ## GIVEN a vcf file with unsorted variants
    vcf_path = unsorted_vcf_path

    ## WHEN checking the vcf
    ## THEN assert that the function raises a VcfError
    with pytest.raises(VcfError):
        check_vcf(vcf_path)
def test_check_vcf_double_variant(double_vcf_path):
    """A VCF containing a duplicated variant must raise a VcfError."""
    ## GIVEN a variant file where a variant is duplicated
    vcf_path = double_vcf_path

    ## WHEN checking the vcf
    ## THEN assert that the function raises a VcfError
    with pytest.raises(VcfError):
        check_vcf(vcf_path)
def load_database(
    adapter,
    variant_file=None,
    sv_file=None,
    family_file=None,
    family_type="ped",
    skip_case_id=False,
    gq_treshold=None,
    case_id=None,
    max_window=3000,
    profile_file=None,
    hard_threshold=0.95,
    soft_threshold=0.9,
    genome_build=None,
):
    """Load the database with a case and its variants

    The given VCF files are checked first, then a case object is built and
    loaded, and finally the variants of each file are loaded. If loading
    the variants fails, the case is deleted again and the error re-raised.

    Args:
        adapter: Connection to database
        variant_file(str): Path to variant file
        sv_file(str): Path to sv variant file
        family_file(str): Path to family file
        family_type(str): Format of family file
        skip_case_id(bool): If no case information should be added to variants
        gq_treshold(int): If only quality variants should be considered
        case_id(str): If different case id than the one in family file should be used
        max_window(int): Specify the max size for sv windows
        profile_file(str): Path to the file profiles are collected from.
            Profiles are only collected and matched when this is given
        hard_threshold(float): Rejects load if hamming distance above this is found
        soft_threshold(float): Stores similar samples if hamming distance above this is found
        genome_build: Passed on unchanged to load_variants

    Returns:
        nr_inserted(int)

    Raises:
        SyntaxError: If gq_treshold is set and a VCF does not define GQ
    """
    vcf_files = []

    nr_variants = None
    vcf_individuals = None
    if variant_file:
        vcf_info = check_vcf(variant_file)
        nr_variants = vcf_info["nr_variants"]
        vcf_files.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = vcf_info["individuals"]

    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        vcf_info = check_vcf(sv_file, "sv")
        nr_sv_variants = vcf_info["nr_variants"]
        vcf_files.append(sv_file)
        sv_individuals = vcf_info["individuals"]

    profiles = None
    matches = None
    if profile_file:
        profiles = get_profiles(adapter, profile_file)
        # Check if any profile already exists
        matches = profile_match(
            adapter,
            profiles,
            hard_threshold=hard_threshold,
            soft_threshold=soft_threshold,
        )

    # If a gq treshold is used the variants needs to have GQ
    for _vcf_file in vcf_files:
        # Get a cyvcf2.VCF object
        vcf = get_vcf(_vcf_file)

        if gq_treshold and not vcf.contains("GQ"):
            LOG.warning("Set gq-treshold to 0 or add info to vcf {0}".format(_vcf_file))
            raise SyntaxError("GQ is not defined in vcf header")

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        LOG.info("Loading family from %s", family_file)
        with open(family_file, "r") as family_lines:
            family = get_case(family_lines=family_lines, family_type=family_type)
            family_id = family.family_id

    # There has to be a case_id or a family at this stage.
    # An explicitly given case_id wins over the id from the family file.
    case_id = case_id or family_id

    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
        profiles=profiles,
        matches=matches,
        profile_path=profile_file,
    )

    # Build and load a new case, or update an existing one
    load_case(
        adapter=adapter,
        case_obj=case_obj,
    )

    nr_inserted = 0
    # If case was successfully added we can store the variants.
    # Each case key is paired with the variant type its file holds.
    for file_type, variant_type in (("vcf_path", "snv"), ("vcf_sv_path", "sv")):
        if case_obj.get(file_type) is None:
            continue

        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
                genome_build=genome_build,
            )
        except Exception as err:
            # If something went wrong do a rollback
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
            )
            # Bare raise re-raises the active exception unchanged
            raise

    return nr_inserted
def update_database(
    adapter,
    variant_file=None,
    sv_file=None,
    family_file=None,
    family_type="ped",
    skip_case_id=False,
    gq_treshold=None,
    case_id=None,
    max_window=3000,
):
    """Update a case in the database

    The given VCF files are checked first, then a case object is built and
    looked up in the database. The existing case is updated and the
    variants of each file are loaded. If loading the variants fails, delete
    is called with the previously existing case and the error re-raised.

    Args:
        adapter: Connection to database
        variant_file(str): Path to variant file
        sv_file(str): Path to sv variant file
        family_file(str): Path to family file
        family_type(str): Format of family file
        skip_case_id(bool): If no case information should be added to variants
        gq_treshold(int): If only quality variants should be considered
        case_id(str): If different case id than the one in family file should be used
        max_window(int): Specify the max size for sv windows

    Returns:
        nr_inserted(int)

    Raises:
        SyntaxError: If gq_treshold is set and a VCF does not define GQ
        CaseError: If the case does not already exist in the database
    """
    vcf_files = []

    nr_variants = None
    vcf_individuals = None
    if variant_file:
        vcf_info = check_vcf(variant_file)
        nr_variants = vcf_info["nr_variants"]
        vcf_files.append(variant_file)
        # Get the individuals that are present in vcf file
        vcf_individuals = vcf_info["individuals"]

    nr_sv_variants = None
    sv_individuals = None
    if sv_file:
        vcf_info = check_vcf(sv_file, "sv")
        nr_sv_variants = vcf_info["nr_variants"]
        vcf_files.append(sv_file)
        sv_individuals = vcf_info["individuals"]

    # If a gq treshold is used the variants needs to have GQ
    for _vcf_file in vcf_files:
        # Get a cyvcf2.VCF object
        vcf = get_vcf(_vcf_file)

        if gq_treshold and not vcf.contains("GQ"):
            LOG.warning("Set gq-treshold to 0 or add info to vcf {0}".format(_vcf_file))
            raise SyntaxError("GQ is not defined in vcf header")

    # Get a ped_parser.Family object from family file
    family = None
    family_id = None
    if family_file:
        with open(family_file, "r") as family_lines:
            family = get_case(family_lines=family_lines, family_type=family_type)
            family_id = family.family_id

    # There has to be a case_id or a family at this stage.
    # An explicitly given case_id wins over the id from the family file.
    case_id = case_id or family_id

    # Convert information to a loqusdb Case object
    case_obj = build_case(
        case=family,
        case_id=case_id,
        vcf_path=variant_file,
        vcf_individuals=vcf_individuals,
        nr_variants=nr_variants,
        vcf_sv_path=sv_file,
        sv_individuals=sv_individuals,
        nr_sv_variants=nr_sv_variants,
    )

    # Only an already stored case can be updated
    existing_case = adapter.case(case_obj)
    if not existing_case:
        raise CaseError("Case {} does not exist in database".format(case_obj["case_id"]))

    # Update the existing case in database
    case_obj = load_case(
        adapter=adapter,
        case_obj=case_obj,
        update=True,
    )

    nr_inserted = 0
    # If case was successfully added we can store the variants.
    # Each case key is paired with the variant type its file holds.
    for file_type, variant_type in (("vcf_path", "snv"), ("vcf_sv_path", "sv")):
        if case_obj.get(file_type) is None:
            continue

        vcf_obj = get_vcf(case_obj[file_type])
        try:
            nr_inserted += load_variants(
                adapter=adapter,
                vcf_obj=vcf_obj,
                case_obj=case_obj,
                skip_case_id=skip_case_id,
                gq_treshold=gq_treshold,
                max_window=max_window,
                variant_type=variant_type,
            )
        except Exception as err:
            # If something went wrong do a rollback
            LOG.warning(err)
            delete(
                adapter=adapter,
                case_obj=case_obj,
                update=True,
                existing_case=existing_case,
            )
            # Bare raise re-raises the active exception unchanged
            raise

    return nr_inserted