def test_debug_option(self):
    """ test whether we can set up the class with the debug option

    Builds a LoadVCFs object with a debug chromosome ("1") and position
    ("10000"), then checks that SNV.passes_filters now refers to the
    debug-aware filter function.
    """

    total_trios = 1
    known_genes = {}

    self.vcf_loader = LoadVCFs(total_trios, known_genes, "1", "10000")

    # check that the debug filter function got set correctly
    self.assertEqual(SNV.passes_filters, SNV.passes_filters_with_debug)
def setUp(self):
    """ build the LoadVCFs fixture and a scratch directory for each test
    """

    # a single known gene entry, so that gene-based checks have something
    # to match against
    atrx = {
        "inheritance": {"Hemizygous": {"Loss of function"}},
        "start": 1,
        "chrom": "1",
        "confirmed_status": {"Confirmed DD Gene"},
        "end": 20000000,
    }
    gene_table = {"ATRX": atrx}
    trio_count = 1

    # the two trailing None arguments mean no debug chrom/position is set
    self.vcf_loader = LoadVCFs(trio_count, gene_table, None, None)

    # scratch directory that the tests write their temporary files into
    self.temp_dir = tempfile.mkdtemp()
def test_debug_option(self):
    """ check that the debug arguments decide which SNV filter is in use
    """

    n_trios = 1
    genes = {}
    tags = None

    # each case is the (debug chrom, debug pos) pair handed to LoadVCFs,
    # plus the assertion to apply afterwards:
    #   - without debug info, the SNV class must not use the debug filter
    #   - with debug info, the debug filter must have been swapped in
    cases = [
        ((None, None), self.assertNotEqual),
        (("1", "10000"), self.assertEqual),
    ]

    for (chrom, pos), check in cases:
        self.vcf_loader = LoadVCFs(n_trios, tags, genes, set(), chrom, pos)
        check(SNV.passes_filters, SNV.passes_filters_with_debug)
def setUp(self):
    """ build the default LoadVCFs object shared by the tests
    """

    # INFO tags holding population allele frequencies
    frequency_tags = [
        "AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF",
        "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF",
    ]

    # a single known gene entry, kept on the instance so individual tests
    # can reuse it
    atrx = {
        "inheritance": {"Hemizygous": {"Loss of function"}},
        "start": 1,
        "chrom": "1",
        "confirmed_status": {"confirmed dd gene"},
        "end": 20000000,
    }
    self.known_genes = {"ATRX": atrx}

    trio_count = 1

    # the two trailing None arguments mean no debug chrom/position is set
    self.vcf_loader = LoadVCFs(trio_count, frequency_tags, self.known_genes,
        set(), None, None)
def filter_trios(self):
    """ screen every affected child in every family for candidate variants

    Families are handled in sorted ID order. Once all of them have been
    processed the interpreter is terminated via sys.exit(0), so this
    method does not return to its caller.
    """

    self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
        self.debug_chrom, self.debug_pos)

    for fam_id in sorted(self.families):
        self.family = self.families[fam_id]

        # a family can contain several children. set_child() selects one,
        # and set_child_examined() is what moves the loop on (presumably
        # leaving family.child as None when nobody is left - confirm
        # against the Family class).
        self.family.set_child()
        while self.family.child is not None:
            # only affected children are analysed, but every child has to
            # be marked as examined
            if self.family.child.is_affected():
                trio_vars = self.vcf_loader.get_trio_variants(self.family,
                    self.pp_filter)
                self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                self.analyse_trio(trio_vars)

            self.family.set_child_examined()

    sys.exit(0)
def filter_trios(self):
    """ screen every affected child in every family for candidate variants

    Families are handled in sorted ID order. Once all of them have been
    processed the interpreter is terminated via sys.exit(0), so this
    method does not return to its caller.
    """

    self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
        self.debug_chrom, self.debug_pos)

    for fam_id in sorted(self.families):
        self.family = self.families[fam_id]

        # a family can contain several children. set_child() selects one,
        # and set_child_examined() is what moves the loop on (presumably
        # leaving family.child as None when nobody is left - confirm
        # against the Family class).
        self.family.set_child()
        while self.family.child is not None:
            # only affected children are analysed, but every child has to
            # be marked as examined
            if self.family.child.is_affected():
                trio_vars = self.vcf_loader.get_trio_variants(self.family,
                    self.pp_filter)
                self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                self.analyse_trio(trio_vars)

            self.family.set_child_examined()

    sys.exit(0)
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    proband's disorder.
    """

    def __init__(self, opts):
        """ initialise the class with some definitions

        Args:
            opts: options object, passed straight to set_definitions(). The
                attributes read below (output_path, export_vcf, ID_mapper,
                known_genes_date) are presumably populated by that call -
                confirm against LoadOptions.
        """

        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf,
            self.ID_mapper, self.known_genes_date)

    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants

        Families are processed in sorted ID order. Note that this method
        ends the process with sys.exit(0) once every family has been
        examined, so it never returns to the caller.
        """

        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
            self.debug_chrom, self.debug_pos)

        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]

            # some families have more than one child in the family, so run
            # through each child.
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(self.family,
                        self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)

                # every child is marked as examined, affected or not,
                # otherwise the while loop could not move on
                self.family.set_child_examined()

        sys.exit(0)

    def analyse_trio(self, variants):
        """ identify candidate variants in exome data for a single trio.

        Takes variants that passed the initial filtering from VCF loading,
        and splits the variants into groups for each gene with variants.
        Then analyses variants in a single gene (so we can utilise the
        appropriate inheritance mechanisms for that gene), before running
        some post-inheritance filters, and exporting the data (if required).

        Args:
            variants: list of TrioGenotypes objects
        """

        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars += self.find_variants(gene_vars, gene)

        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)

        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family,
            self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()

        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family,
            self.vcf_loader.child_header, self.vcf_provenance)

    def create_gene_dict(self, variants):
        """ creates dictionary of variants indexed by gene

        Args:
            variants: list of TrioGenotypes objects

        Returns:
            dictionary of variants indexed by HGNC symbols. A variant that
            reports several genes is listed under each of them.
        """

        # organise the variants into entries for each gene
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)

        return genes

    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models

        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string (None for intergenic variants)

        Returns:
            list of (variant, check_types, inheritance_types, [gene]) tuples
            for the variants that pass inheritance checks. Empty if nothing
            passes, or if gene is None.

        Raises:
            ValueError: if the first variant reports an inheritance type
                that is neither autosomal nor one of the handled allosomal
                types.
        """

        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            gene_inh = self.known_genes[gene]["inh"]

        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]

        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if var.get_chrom() == self.debug_chrom and var.get_position() == self.debug_pos:
                    print(var, "lacks HGNC/gene symbol")
            return []

        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [var for var in variants
            if var.child.is_lof(gene) or var.child.is_missense(gene)]
        if not variants:
            return []

        logging.debug("{} {} {} {}".format(self.family.child.get_id(), gene,
            variants, gene_inh))

        # the first variant decides which inheritance checker handles the
        # whole gene
        chrom_inheritance = variants[0].get_inheritance_type()

        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        else:
            # without this branch an unexpected type fell through to an
            # UnboundLocalError on "finder", which hid the real problem
            raise ValueError("unknown inheritance type '{}' for gene {}".format(
                chrom_inheritance, gene))

        variants = finder.get_candidate_variants()
        variants = [(x[0], list(x[1]), list(x[2]), [gene]) for x in variants]

        return variants

    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms

        The input tuples and their lists are left unmodified; merging is
        done on copies.

        Args:
            variants: list of candidate variants, as (variant, check_types,
                inheritance_types, hgnc_symbols) tuples

        Returns:
            list of (variant, check_types, inheritance_types, hgnc_symbols)
            tuples, with duplicates excluded, and the first instance of each
            variant extended to show every mechanism and gene symbol it was
            found under
        """

        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                # copy the inner lists, so that extending the merged entry
                # later never reaches back into the caller's data
                merged = list(variant)
                merged[1:4] = [list(x) for x in merged[1:4]]
                unique_vars[key] = merged
                continue

            # append the check types, inheritance types and HGNC symbols
            # that the first instance of the variant does not have yet
            merged = unique_vars[key]
            for idx in (1, 2, 3):
                merged[idx] += [x for x in variant[idx] if x not in merged[idx]]

        return [tuple(x) for x in unique_vars.values()]
class ClinicalFilter(LoadOptions):
    """ filters trios for candidate variants that might contribute to a
    proband's disorder.
    """

    def __init__(self, opts):
        """ initialise the class with some definitions

        Args:
            opts: options object, passed straight to set_definitions(). The
                attributes read below (output_path, export_vcf, ID_mapper,
                known_genes_date) are presumably populated by that call -
                confirm against LoadOptions.
        """

        self.set_definitions(opts)
        self.report = Report(self.output_path, self.export_vcf,
            self.ID_mapper, self.known_genes_date)

    def filter_trios(self):
        """ loads trio variants, and screens for candidate variants

        Families are processed in sorted ID order. Note that this method
        ends the process with sys.exit(0) once every family has been
        examined, so it never returns to the caller.
        """

        self.vcf_loader = LoadVCFs(len(self.families), self.known_genes,
            self.debug_chrom, self.debug_pos)

        # load the trio paths into the current path setup
        for family_ID in sorted(self.families):
            self.family = self.families[family_ID]

            # some families have more than one child in the family, so run
            # through each child.
            self.family.set_child()
            while self.family.child is not None:
                if self.family.child.is_affected():
                    variants = self.vcf_loader.get_trio_variants(self.family,
                        self.pp_filter)
                    self.vcf_provenance = self.vcf_loader.get_trio_provenance()
                    self.analyse_trio(variants)

                # every child is marked as examined, affected or not,
                # otherwise the while loop could not move on
                self.family.set_child_examined()

        sys.exit(0)

    def analyse_trio(self, variants):
        """ identify candidate variants in exome data for a single trio.

        Takes variants that passed the initial filtering from VCF loading,
        and splits the variants into groups for each gene with variants.
        Then analyses variants in a single gene (so we can utilise the
        appropriate inheritance mechanisms for that gene), before running
        some post-inheritance filters, and exporting the data (if required).

        Args:
            variants: list of TrioGenotypes objects
        """

        # organise variants by gene, then find variants that fit
        # different inheritance models
        genes_dict = self.create_gene_dict(variants)
        found_vars = []
        for gene, gene_vars in genes_dict.items():
            found_vars += self.find_variants(gene_vars, gene)

        # remove any duplicate variants (which might occur due to CNVs being
        # checked against all the genes that they encompass)
        found_vars = self.exclude_duplicates(found_vars)

        # apply some final filters to the flagged variants
        post_filter = PostInheritanceFilter(found_vars, self.family,
            self.debug_chrom, self.debug_pos)
        found_vars = post_filter.filter_variants()

        # export the results to either tab-separated table or VCF format
        self.report.export_data(found_vars, self.family,
            self.vcf_loader.child_header, self.vcf_provenance)

    def create_gene_dict(self, variants):
        """ creates dictionary of variants indexed by gene

        Args:
            variants: list of TrioGenotypes objects

        Returns:
            dictionary of variants indexed by HGNC symbols. A variant that
            reports several genes is listed under each of them.
        """

        # organise the variants into entries for each gene
        genes = {}
        for var in variants:
            # variants (particularly CNVs) can span multiple genes, so we need
            # to check each gene separately, and then collapse duplicates later
            for gene in var.get_genes():
                genes.setdefault(gene, []).append(var)

        return genes

    def find_variants(self, variants, gene):
        """ finds variants that fit inheritance models

        Args:
            variants: list of TrioGenotype objects
            gene: gene ID as string (None for intergenic variants)

        Returns:
            list of (variant, check_types, inheritance_types, [gene]) tuples
            for the variants that pass inheritance checks. Empty if nothing
            passes, or if gene is None.

        Raises:
            ValueError: if the first variant reports an inheritance type
                that is neither autosomal nor one of the handled allosomal
                types.
        """

        # get the inheritance for the gene (monoallelic, biallelic, hemizygous
        # etc), but allow for times when we haven't specified a list of genes
        # to use
        gene_inh = None
        if self.known_genes is not None and gene in self.known_genes:
            gene_inh = self.known_genes[gene]["inh"]

        # If we are looking for variants in a set of known genes, and the gene
        # isn't part of that set, then we don't want to examine the variant for
        # that gene, UNLESS the variant is a CNV, since CNVs can be included
        # purely from size thresholds, regardless of which gene they overlap.
        if self.known_genes is not None and gene not in self.known_genes:
            variants = [x for x in variants if x.is_cnv()]

        # ignore intergenic variants
        if gene is None:
            for var in variants:
                if var.get_chrom() == self.debug_chrom and var.get_position() == self.debug_pos:
                    print(var, "lacks HGNC/gene symbol")
            return []

        # Now that we are examining a single gene, check that the consequences
        # for the gene are in the required functional categories.
        variants = [var for var in variants
            if var.child.is_lof(gene) or var.child.is_missense(gene)]
        if not variants:
            return []

        logging.debug("{} {} {} {}".format(self.family.child.get_id(), gene,
            variants, gene_inh))

        # the first variant decides which inheritance checker handles the
        # whole gene
        chrom_inheritance = variants[0].get_inheritance_type()

        if chrom_inheritance == "autosomal":
            finder = Autosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        elif chrom_inheritance in ["XChrMale", "XChrFemale", "YChrMale"]:
            finder = Allosomal(variants, self.family, self.known_genes, gene,
                self.cnv_regions)
        else:
            # without this branch an unexpected type fell through to an
            # UnboundLocalError on "finder", which hid the real problem
            raise ValueError("unknown inheritance type '{}' for gene {}".format(
                chrom_inheritance, gene))

        variants = finder.get_candidate_variants()
        variants = [(x[0], list(x[1]), list(x[2]), [gene]) for x in variants]

        return variants

    def exclude_duplicates(self, variants):
        """ rejig variants included under multiple inheritance mechanisms

        The input tuples and their lists are left unmodified; merging is
        done on copies.

        Args:
            variants: list of candidate variants, as (variant, check_types,
                inheritance_types, hgnc_symbols) tuples

        Returns:
            list of (variant, check_types, inheritance_types, hgnc_symbols)
            tuples, with duplicates excluded, and the first instance of each
            variant extended to show every mechanism and gene symbol it was
            found under
        """

        unique_vars = {}
        for variant in variants:
            key = variant[0].child.get_key()
            if key not in unique_vars:
                # copy the inner lists, so that extending the merged entry
                # later never reaches back into the caller's data
                merged = list(variant)
                merged[1:4] = [list(x) for x in merged[1:4]]
                unique_vars[key] = merged
                continue

            # append the check types, inheritance types and HGNC symbols
            # that the first instance of the variant does not have yet
            merged = unique_vars[key]
            for idx in (1, 2, 3):
                merged[idx] += [x for x in variant[idx] if x not in merged[idx]]

        return [tuple(x) for x in unique_vars.values()]
class TestLoadVCFsPy(unittest.TestCase):
    """ unit tests for the LoadVCFs class

    Each test gets a fresh LoadVCFs object and a fresh temporary directory
    (see setUp), and the directory is removed again in tearDown.
    """

    def setUp(self):
        """ define a default LoadVCFs object
        """

        total_trios = 1
        known_genes = {"ATRX": {"inheritance": {"Hemizygous":
            {"Loss of function"}}, "start": 1, "chrom": "1",
            "confirmed_status": {"Confirmed DD Gene"}, "end": 20000000}}

        self.vcf_loader = LoadVCFs(total_trios, known_genes, None, None)

        # make a temp directory for the cache file
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """ remove the temp directory once a test completes
        """

        shutil.rmtree(self.temp_dir)

    def make_minimal_vcf(self):
        """ construct the bare minimum of lines for a VCF file

        Returns:
            list of newline-terminated strings: four header lines followed
            by two variant lines.
        """

        header = [
            "##fileformat=VCFv4.1\n",
            "##fileDate=2014-01-01\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_id\n",
        ]

        variants = [
            "1\t100\t.\tT\tA\t1000\tPASS\t.\tGT\t0/1\n",
            "1\t200\t.\tT\tA\t1000\tPASS\t.\tGT\t0/1\n",
        ]

        return header + variants

    def write_temp_vcf(self, filename, vcf_data):
        """ writes data to a file, and returns the full path to the file

        Args:
            filename: basename for the file, created inside self.temp_dir
            vcf_data: list of lines to write

        Returns:
            full path to the written file
        """

        full_path = os.path.join(self.temp_dir, filename)

        # the context manager makes sure the handle is closed even if the
        # write fails
        with open(full_path, "w") as output:
            output.write("".join(vcf_data))

        return full_path

    def write_gzipped_vcf(self, filename, vcf_data):
        """ writes data to a gzip file, and returns the full path to the file

        Args:
            filename: basename for the file, created inside self.temp_dir
            vcf_data: list of lines to write

        Returns:
            full path to the written file
        """

        full_path = os.path.join(self.temp_dir, filename)

        # python 2 writes its native (byte) strings in binary mode, whereas
        # python 3 needs text mode in order to accept str data
        mode = 'wb' if IS_PYTHON2 else 'wt'
        with gzip.open(full_path, mode) as f:
            f.write("".join(vcf_data))

        return full_path

    def test_open_vcf_file(self):
        """ test obtaining a file handle for the VCF
        """

        vcf = self.make_minimal_vcf()
        path = self.write_temp_vcf("temp.vcf", vcf)

        # check that plain VCF files can be loaded
        handle = self.vcf_loader.open_vcf_file(path)
        self.assertEqual(type(handle), io.TextIOWrapper)
        handle.close()

        # check that gzipped vcf files are handled correctly
        path = self.write_gzipped_vcf("temp.vcf.gz", vcf)
        handle = self.vcf_loader.open_vcf_file(path)
        if IS_PYTHON2:
            self.assertEqual(type(handle), gzip.GzipFile)
        elif IS_PYTHON3:
            self.assertEqual(type(handle), io.TextIOWrapper)
        handle.close()

        # make sure files that don't exist raise an error
        path = os.path.join(self.temp_dir, "zzz.txt")
        with self.assertRaises(OSError):
            self.vcf_loader.open_vcf_file(path)

        # check that files with unknown extensions raise errors
        path = self.write_temp_vcf("temp.zzz", vcf)
        with self.assertRaises(OSError):
            self.vcf_loader.open_vcf_file(path)

    def test_get_vcf_header(self):
        """ test that get_vcf_header() works correctly
        """

        vcf = self.make_minimal_vcf()
        path = self.write_temp_vcf("temp.vcf", vcf)

        header = self.vcf_loader.get_vcf_header(path)

        # check that the header is returned correctly (the first four lines
        # of the minimal VCF are the header lines)
        self.assertEqual(header, vcf[:4])

    def test_exclude_header(self):
        """ test that exclude_header() works correctly
        """

        vcf = self.make_minimal_vcf()

        # make sure we drop the header, and only the header from the file
        # check this by reading the file, and making sure the first line
        # is the line we expect from the VCF
        path = self.write_temp_vcf("temp.vcf", vcf)
        with open(path, "r") as handler:
            self.vcf_loader.exclude_header(handler)
            self.assertEqual(handler.readline(), vcf[4])

        # also check for gzipped VCF files.
        path = self.write_gzipped_vcf("temp.vcf.gz", vcf)
        mode = "r" if IS_PYTHON2 else "rt"
        with gzip.open(path, mode) as handler:
            self.vcf_loader.exclude_header(handler)
            self.assertEqual(handler.readline(), vcf[4])

    def test_add_single_variant(self):
        """ test that add_single_variant() works correctly
        """

        # the sub-functions are all tested elsewhere, this test merely checks
        # that valid variants are added to the variants list, and invalid
        # variants are passed over without being added to the variants list

        # set up an autosomal variant
        line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        gender = "M"
        variant = SNV(*line[:6])

        # check that the variant is added to the variant list
        variants = []
        self.vcf_loader.add_single_variant(variants, variant, gender, line)
        self.assertEqual(variants, [variant])

        # set up an X-chrom male het
        line = ["X", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        variant = SNV(*line[:6])

        # check that the X-chrom male het is not added to the variant list
        variants = []
        self.vcf_loader.add_single_variant(variants, variant, gender, line)
        self.assertEqual(variants, [])

    def test_get_vcf_provenance(self):
        """ test that get_vcf_provenance() works correctly
        """

        vcf = self.make_minimal_vcf()
        vcf_string = "".join(vcf)
        if IS_PYTHON3:
            # hashlib needs bytes under python 3
            vcf_string = vcf_string.encode("utf-8")
        ungzipped_hash = hashlib.sha1(vcf_string).hexdigest()
        path = self.write_temp_vcf("temp.vcf", vcf)

        # check that the file defs return correctly
        (checksum, basename, date) = self.vcf_loader.get_vcf_provenance(path)

        self.assertEqual(checksum, ungzipped_hash)
        self.assertEqual(basename, "temp.vcf")
        self.assertEqual(date, "2014-01-01")

        # now write a gzip file, and check that we get the correct hash (the
        # expected checksum is taken over the compressed bytes on disk)
        path = self.write_gzipped_vcf("test.vcf.gz", vcf)
        with open(path, "rb") as handle:
            gzipped_hash = hashlib.sha1(handle.read()).hexdigest()

        (checksum, basename, date) = self.vcf_loader.get_vcf_provenance(path)
        self.assertEqual(checksum, gzipped_hash)

        # check that when a fileDate isn't available in the VCF, we can pick
        # the date from the path
        vcf.pop(1)
        path = self.write_temp_vcf("temp.file_process.2014-02-20.vcf", vcf)
        (checksum, basename, date) = self.vcf_loader.get_vcf_provenance(path)
        self.assertEqual(date, "2014-02-20")

    def test_construct_variant(self):
        """ test that construct_variant() works correctly
        """

        # check that construct variant works for SNVs
        line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        gender = "M"
        test_var = SNV(*line[:6])

        variant = self.vcf_loader.construct_variant(line, gender)

        self.assertEqual(variant.get_key(), test_var.get_key())
        # initially constructing a SNV shouldn't affect the format variable
        self.assertEqual(variant.format, None)

        # check that construct variant works for CNVs
        line = ["1", "100", ".", "T", "<DEL>", "1000", "PASS", "END=200",
            "GT", "0/1"]
        test_var = CNV(*line[:6])
        test_var.add_info(line[7])

        variant = self.vcf_loader.construct_variant(line, gender)

        self.assertEqual(variant.get_key(), test_var.get_key())
        self.assertNotEqual(variant.format, None)

        # TODO: add checks for when HGNC is in the filters

    def test_include_variant(self):
        """ check that include_variant() works correctly
        """

        child_variants = False
        gender = "M"

        # make a child var which passes the filters
        line = ["1", "100", ".", "T", "A", "1000", "PASS",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"]
        self.assertTrue(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # make a child var that fails the filters, which should return False
        line = ["1", "100", ".", "T", "A", "1000", "FAIL",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"]
        self.assertFalse(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # now check for parents variants
        child_variants = True

        # check a parents var, where we have a matching child var
        self.vcf_loader.child_keys = {("1", 100), ("X", 200)}
        line = ["1", "100", ".", "T", "A", "1000", "FAIL",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"]
        self.assertTrue(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # check a parents var, where we don't have a matching child var
        line = ["1", "200", ".", "T", "A", "1000", "FAIL",
            "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"]
        self.assertFalse(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # and check parental CNVs
        line = ["1", "100", ".", "T", "<DEL>", "1000", "PASS", "END=200",
            "GT", "0/1"]
        test_var = CNV(*line[:6])
        test_var.add_info(line[7])

        # in this function we look for overlap in CNVs. Set up a child CNV
        # that the parents CNV must match.
        self.vcf_loader.cnv_matcher = MatchCNVs([test_var])
        self.assertTrue(
            self.vcf_loader.include_variant(line, child_variants, gender))

        # check that a parental CNV without any overlap to any childs CNVs,
        # fails to pass
        line = ["1", "300", ".", "T", "<DEL>", "1000", "PASS", "END=400",
            "GT", "0/1"]
        self.assertFalse(
            self.vcf_loader.include_variant(line, child_variants, gender))

    def test_filter_de_novos(self):
        """ check that filter_de_novos() works correctly
        """

        # make a family without parents
        family = Family("fam_id")
        child_gender = "female"
        family.add_child("child_id", "child_vcf_path", "2", child_gender)
        self.vcf_loader.family = family

        # set up an autosomal variant
        line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"]
        child_var = SNV(*line[:6])
        child_var.add_info(line[7])
        child_var.add_format(line[8], line[9])
        child_var.set_gender(child_gender)
        child_var.set_genotype()

        # combine the variant into a list of TrioGenotypes
        child_vars = [child_var]
        mother_vars = []
        father_vars = []
        trio_variants = self.vcf_loader.combine_trio_variants(
            child_vars, mother_vars, father_vars)

        # check that vars without parents get passed through automatically
        self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9),
            trio_variants)

        # now add parents to the family
        family.add_mother("mother_id", "mother_vcf_path", "1", "female")
        family.add_father("father_id", "father_vcf_path", "1", "male")

        # re-generate the variants list now that parents have been included
        trio_variants = self.vcf_loader.combine_trio_variants(
            child_vars, mother_vars, father_vars)

        # check that vars with parents, and that appear to be de novo are
        # filtered out
        self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9),
            [])

        # check that vars with parents, but which are not de novo, are retained
        mother_vars = child_vars
        trio_variants = self.vcf_loader.combine_trio_variants(
            child_vars, mother_vars, father_vars)

        self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9),
            trio_variants)

    def test_debug_option(self):
        """ test whether we can set up the class with the debug option
        """

        total_trios = 1
        known_genes = {}

        self.vcf_loader = LoadVCFs(total_trios, known_genes, "1", "10000")

        # check that the debug filter function got set correctly
        self.assertEqual(SNV.passes_filters, SNV.passes_filters_with_debug)
class TestLoadVCFsPy(unittest.TestCase): """ test that the LoadVCFs methods work as expected """ @classmethod def setUpClass(cls): cls.temp_dir = tempfile.mkdtemp() @classmethod def tearDownClass(cls): shutil.rmtree(cls.temp_dir) def setUp(self): """ define a default LoadVCFs object """ total_trios = 1 maf_tags = ["AFR_AF", "AMR_AF", "ASN_AF", "DDD_AF", "EAS_AF", "ESP_AF", "EUR_AF", "MAX_AF", "SAS_AF", "UK10K_cohort_AF"] self.known_genes = {"ATRX": {"inheritance": {"Hemizygous": \ {"Loss of function"}}, "start": 1, "chrom": "1", \ "confirmed_status": {"confirmed dd gene"}, "end": 20000000}} self.vcf_loader = LoadVCFs(total_trios, maf_tags, self.known_genes, set(), None, None, ) def write_temp_vcf(self, path, vcf_data): """ writes data to a file """ with open(path, 'w') as handle: handle.writelines(vcf_data) def write_gzipped_vcf(self, path, lines): ''' write, compress, and index lines for a VCF ''' with tempfile.NamedTemporaryFile(dir=self.temp_dir) as handle: for x in lines: handle.write(x.encode('utf8')) handle.flush() # assume bgzip and tabix binaries are available, this should be # handled by travis-ci setup. 
with open(path, 'w') as output: subprocess.call(['bgzip', '-c', handle.name], stdout=output) subprocess.call(['tabix', '-f', '-p', 'vcf', path]) def test_open_vcf(self): """ test obtaining a file handle for the VCF """ vcf = make_minimal_vcf() path = os.path.join(self.temp_dir, "temp.vcf") self.write_temp_vcf(path, vcf) # check that plain VCF files can be loaded handle = open_vcf(path) self.assertEqual(type(handle), io.TextIOWrapper) handle.close() # check that gzipped vcf files are handled correctly path = os.path.join(self.temp_dir, "temp.vcf.gz") self.write_gzipped_vcf(path, vcf) handle = open_vcf(path) if IS_PYTHON3: self.assertEqual(type(handle), io.TextIOWrapper) else: self.assertEqual(type(handle), gzip.GzipFile) handle.close() # make sure files that don't exists raise an error path = os.path.join(self.temp_dir, "zzz.txt") with self.assertRaises(OSError): open_vcf(path) # check that files with unknown extensions raise errors path = os.path.join(self.temp_dir, "temp.zzz") self.write_temp_vcf(path, vcf) with self.assertRaises(OSError): open_vcf(path) def test_get_vcf_header(self): """ test that get_vcf_header() works correctly """ vcf = make_minimal_vcf() path = os.path.join(self.temp_dir, "temp.vcf") self.write_temp_vcf(path, vcf) header = get_vcf_header(path) # check that the header is returned correctly self.assertEqual(header, vcf[:4]) def test_exclude_header(self): """ test that exclude_header() works correctly """ vcf = make_minimal_vcf() # make sure we drop the header, and only the header from the file # check this by reading the file, and making sure the first line # is the line we expect from the VCF path = os.path.join(self.temp_dir, "temp.vcf") self.write_temp_vcf(path, vcf) handler = open(path, "r") exclude_header(handler) self.assertEqual(handler.readline(), vcf[4]) handler.close() # also check for gzipped VCF files. 
path = os.path.join(self.temp_dir, "temp.vcf.gz") self.write_gzipped_vcf(path, vcf) mode = 'r' if IS_PYTHON3: mode = 'rt' with gzip.open(path, mode) as handler: exclude_header(handler) self.assertEqual(handler.readline(), vcf[4]) def test_add_single_variant(self): """ test that add_single_variant() works correctly """ # the sub-functions are all tested elsewhere, this test merely checks # that valid variants are added to the variants list, and invalid # variants are passed over without being added to the variants list # set up an autosomal variant line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"] gender = "M" variant = SNV(*line[:6]) # check that the variant is added to the variant list variants = [] self.vcf_loader.add_single_variant(variants, variant, gender, line) self.assertEqual(variants, [variant]) # set up an X-chrom male het line = ["X", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"] variant = SNV(*line[:6]) # check that the X-chrom male het is not added to the variant list variants = [] self.vcf_loader.add_single_variant(variants, variant, gender, line) self.assertEqual(variants, []) def test_get_vcf_provenance(self): """ test that get_vcf_provenance() works correctly """ path = os.path.join(self.temp_dir, "temp.vcf") gz_path = os.path.join(self.temp_dir, "temp.vcf.gz") date_path = os.path.join(self.temp_dir, "temp.process.2014-02-20.vcf") family = Family('famid') family.add_child('child_id', 'mother', 'father', 'f', '2', path) family.add_mother('mom_id', '0', '0', 'female', '1', gz_path) family.add_father('dad_id', '0', '0', 'male', '1', date_path) family.set_child() vcf = make_minimal_vcf() vcf_string = "".join(vcf) if IS_PYTHON3: vcf_string = vcf_string.encode("utf-8") ungzipped_hash = hashlib.sha1(vcf_string).hexdigest() header = vcf[:4] self.write_temp_vcf(path, vcf) # check that the file defs return correctly (checksum, basename, date) = get_vcf_provenance(family.child) self.assertEqual(checksum, ungzipped_hash) 
self.assertEqual(basename, "temp.vcf") self.assertEqual(date, "2014-01-01") # now write a gzip file, and check that we get the correct hash self.write_gzipped_vcf(gz_path, vcf) handle = open(gz_path, "rb") gzipped_hash = hashlib.sha1(handle.read()).hexdigest() handle.close() (checksum, basename, date) = get_vcf_provenance(family.mother) self.assertEqual(checksum, gzipped_hash) # check that when a fileDate isn't available in the VCF, we can pick # the date from the path vcf.pop(1) self.write_temp_vcf(date_path, vcf) (checksum, basename, date) = get_vcf_provenance(family.father) self.assertEqual(date, "2014-02-20") # and check we get null values if the family member is not present family.father = None provenance = get_vcf_provenance(family.father) self.assertEqual(provenance, ('NA', 'NA', 'NA')) def test_construct_variant(self): """ test that construct_variant() works correctly """ # check that construct variant works for SNVs line = ["1", "100", ".", "T", "G", "1000", "PASS", ".", "GT", "0/1"] gender = "M" test_var = SNV(*line[:6]) variant = construct_variant(line, gender, self.known_genes) self.assertEqual(variant.get_key(), test_var.get_key()) # initally constructing a SNV shouldn't affect the format variable self.assertEqual(variant.format, None) # check that construct variant works for CNVs line = ["1", "100", ".", "T", "<DEL>", "1000", "PASS", "END=200", "GT", "0/1"] gender = "M" test_var = CNV(*line[:6]) test_var.add_info(line[7]) variant = construct_variant(line, gender, self.known_genes) self.assertEqual(variant.get_key(), test_var.get_key()) self.assertNotEqual(variant.format, None) # TODO: add checks for when HGNC is in the the filters def test_include_variant(self): """ check that include_variant() works correctly """ mnvs = {} child_variants = False gender = "M" # make a child var which passes the filters line = ["1", "100", ".", "T", "A", "1000", "PASS", "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"] self.assertTrue(self.vcf_loader.include_variant(line, 
child_variants, gender, mnvs)) # make a child var that fails the filters, which should return False line = ["1", "100", ".", "T", "A", "1000", "FAIL", "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"] self.assertFalse(self.vcf_loader.include_variant(line, child_variants, gender, mnvs)) # now check for parents variants child_variants = True # check a parents var, where we have a matching child var self.vcf_loader.child_keys = set([("1", 100), ("X", 200)]) line = ["1", "100", ".", "T", "A", "1000", "FAIL", "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"] self.assertTrue(self.vcf_loader.include_variant(line, child_variants, gender, mnvs)) # check a parents var, where we don't have a matching child var line = ["1", "200", ".", "T", "A", "1000", "FAIL", "CQ=missense_variant;HGNC=ATRX", "GT", "0/1"] self.assertFalse(self.vcf_loader.include_variant(line, child_variants, gender, mnvs)) # and check parental CNVs line = ["1", "100", ".", "T", "<DEL>", "1000", "PASS", "END=200", "GT", "0/1"] gender = "M" test_var = CNV(*line[:6]) test_var.add_info(line[7]) # in this function we look for overlap in CNVs. Set up a child CNV # that the parents CNV must match. 
self.assertTrue(self.vcf_loader.include_variant(line, child_variants, gender, mnvs)) # check that a parental CNV without any overlap to any childs CNVs, # fails to pass line = ["1", "300", ".", "T", "<DEL>", "1000", "PASS", "END=400", "GT", "0/1"] gender = "M" self.assertFalse(self.vcf_loader.include_variant(line, child_variants, gender, mnvs)) def test_open_individual(self): ''' test that open_individual() works correctly ''' # missing individual returns empty list self.assertEqual(self.vcf_loader.open_individual(None), []) vcf = make_vcf_header() vcf.append(make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001')) vcf.append(make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001')) path = os.path.join(self.temp_dir, "temp.vcf") self.write_temp_vcf(path, vcf) person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path) var1 = SNV(chrom="1", position=1, id=".", ref="G", alts="T", filter="PASS", info="CQ=missense_variant;HGNC=TEST;MAX_AF=0.0001", format="DP:GT", sample="50:0/1", gender="female", mnv_code=None) var2 = SNV(chrom="1", position=2, id=".", ref="G", alts="T", filter="PASS", info="CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", format="DP:GT", sample="50:0/1", gender="female", mnv_code=None) self.assertEqual(self.vcf_loader.open_individual(person), [var2]) # define a set of variants to automatically pass, and check that these # variants pass. 
self.vcf_loader.child_keys = set([('1', 1), ('1', 2)]) self.assertEqual(self.vcf_loader.open_individual(person, child_variants=True), [var1, var2]) def test_open_individual_with_mnvs(self): ''' test that open_individual works with MNVs ''' vcf = make_vcf_header() vcf.append(make_vcf_line(pos=1, cq='splice_region_variant', extra='HGNC=ATRX;MAX_AF=0.0001')) vcf.append(make_vcf_line(pos=2, cq='missense_variant', extra='HGNC=ATRX;MAX_AF=0.0001')) path = os.path.join(self.temp_dir, "temp.vcf.gz") self.write_gzipped_vcf(path, vcf) person = Person('fam_id', 'sample', 'dad', 'mom', 'F', '2', path) args = {'chrom': "1", 'position': 1, 'id': ".", 'ref': "G", 'alts': "T", 'filter': "PASS", 'info': "CQ=splice_region_variant;HGNC=ATRX;MAX_AF=0.0001", 'format': "DP:GT", 'sample': "50:0/1", 'gender': "female", 'mnv_code': 'modified_protein_altering_mnv'} var1 = SNV(**args) args['position'] = 2 args['mnv_code'] = None args['info'] = "CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001" var2 = SNV(**args) # by default only one variant passes self.assertEqual(self.vcf_loader.open_individual(person), [var2]) # if we include MNVs, then the passing variants swap self.assertEqual(self.vcf_loader.open_individual(person, mnvs={('1', 1): 'modified_protein_altering_mnv', ('1', 2): 'modified_synonymous_mnv'}), [var1]) def test_load_trio(self): ''' test that load_trio() works correctly ''' def make_vcf(person): # make a VCF, where one line would pass the default filtering vcf = make_vcf_header() vcf.append(make_vcf_line(pos=1, extra='HGNC=TEST;MAX_AF=0.0001')) vcf.append(make_vcf_line(pos=2, extra='HGNC=ATRX;MAX_AF=0.0001')) path = os.path.join(self.temp_dir, "{}.vcf.gz".format(person)) self.write_gzipped_vcf(path, vcf) return path child_path = make_vcf('child') mother_path = make_vcf('mother') father_path = make_vcf('father') family = Family('fam_id') family.add_child('sample', 'mother_id', 'father_id', 'female', '2', child_path) family.add_mother('mother_id', '0', '0', 'female', '1', mother_path) 
family.add_father('father_id', '0', '0', 'male', '1', father_path) family.set_child() # define the parameters and values for the SNV class args = {'chrom': "1", 'position': 2, 'id': ".", 'ref': "G", 'alts': "T", 'filter': "PASS", 'info': "CQ=missense_variant;HGNC=ATRX;MAX_AF=0.0001", 'format': "DP:GT", 'sample': "50:0/1", 'gender': "female", 'mnv_code': None} dad_args = copy.deepcopy(args) dad_args['gender'] = 'male' self.assertEqual(self.vcf_loader.load_trio(family), [TrioGenotypes(chrom="1", pos=2, child=SNV(**args), mother=SNV(**args), father=SNV(**dad_args)) ]) def test_get_parental_var_snv(self): ''' check that get_parental_var() works correctly for SNVs ''' sex = 'F' var = create_snv(sex, '0/1') mom = Person('fam_id', 'mom', '0', '0', 'F', '1', '/PATH') parental = [] # try to get a matching variant for a mother. This will create a default # variant for a missing parental genotype self.assertEqual(self.vcf_loader.get_parental_var(var, parental, mom), SNV(chrom="1", position=150, id=".", ref="A", alts="G", filter="PASS", info=var.get_info_as_string(), format="GT", sample="0/0", gender="female", mnv_code=None)) # now see if we can pick up a variant where it does exist mother_var = create_snv(sex, '0/0') self.assertEqual(self.vcf_loader.get_parental_var(var, [mother_var], mom), mother_var) def test_get_parental_var_cnv(self): ''' check that get_parental_var() works correctly for CNVs ''' sex = 'F' var = create_cnv(sex, 'deNovo') mom = Person('fam_id', 'mom', '0', '0', 'F', '1', '/PATH') parental_vars = [] self.assertEqual(self.vcf_loader.get_parental_var(var, parental_vars, mom), CNV(chrom="1", position=150, id=".", ref="A", alts="<REF>", filter="PASS", info=var.get_info_as_string(), format='INHERITANCE', sample='uncertain', gender="female", mnv_code=None)) # check that even if a CNV exist in the parent at a matching site, we # still create a new CNV objectr for the parent mother_var = create_cnv(sex, 'uncertain') 
self.assertEqual(self.vcf_loader.get_parental_var(var, [mother_var], mom), CNV(chrom="1", position=150, id=".", ref="A", alts="<REF>", filter="PASS", info=var.get_info_as_string(), format='INHERITANCE', sample='uncertain', gender="female", mnv_code=None)) def test_get_parental_var_cnv_maternally_inherited(self): ''' ''' sex = 'F' mom = Person('fam_id', 'mom', '0', '0', 'F', '1', '/PATH') # check that even if a CNV exist in the parent at a matching site, we # still create a new CNV object for the parent var = create_cnv(sex, 'maternal') self.assertEqual(self.vcf_loader.get_parental_var(var, [], mom), CNV(chrom="1", position=150, id=".", ref="A", alts="<DUP>", filter="PASS", info=var.get_info_as_string(), format='INHERITANCE', sample='uncertain', gender="female", mnv_code=None)) def test_filter_de_novos(self): """ check that filter_de_novos() works correctly """ # make a family without parents family = Family("fam_id") child_gender = "female" family.add_child('child_id', 'mother_id', 'father_id', child_gender, '2', 'child_path') self.vcf_loader.family = family # set up an autosomal variant gender = "M" args = ["1", "100", ".", "T", "G", "PASS", ".", "GT", "0/1", gender] child_var = SNV(*args) # combine the variant into a list of TrioGenotypes child_vars = [child_var] mother_vars = [] father_vars = [] trio_variants = self.vcf_loader.combine_trio_variants(family, child_vars, mother_vars, father_vars) # check that vars without parents get passed through automatically self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9), trio_variants) # now add parents to the family family.add_mother("mother_id", '0', '0', 'female', '1', "mother_vcf_path") family.add_father("father_id", '0', '0', 'male', '1', "father_vcf_path") self.vcf_loader.family = family # re-generate the variants list now that parents have been included trio_variants = self.vcf_loader.combine_trio_variants(family, child_vars, mother_vars, father_vars) # check that vars with parents, and that appear 
to be de novo are # filtered out self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9), []) # check that vars with parents, but which are not de novo, are retained mother_vars = child_vars trio_variants = self.vcf_loader.combine_trio_variants(family, child_vars, mother_vars, father_vars) self.assertEqual(self.vcf_loader.filter_de_novos(trio_variants, 0.9), trio_variants) def test_debug_option(self): """ test whether we can set up the class with the debug option """ total_trios = 1 known_genes = {} maf_tags = None # if the debug info isn't available, then the SNV object doesn't use the # debug filter function self.vcf_loader = LoadVCFs(total_trios, maf_tags, known_genes, set(), None, None) self.assertNotEqual(SNV.passes_filters, SNV.passes_filters_with_debug) # if the debug info is passed in, check that the debug filter function # got set correctly self.vcf_loader = LoadVCFs(total_trios, maf_tags, known_genes, set(), "1", "10000") self.assertEqual(SNV.passes_filters, SNV.passes_filters_with_debug)