def _make_dv_call(ref_bases='A', alt_bases='C'):
    """Build a DeepVariantCall for a variant at chr1:10-11 for use in tests.

    Args:
        ref_bases: Reference bases stored on the variant.
        alt_bases: The single alternate allele stored on the variant.

    Returns:
        A DeepVariantCall wrapping the variant, with one allele-support entry
        holding the result of `_supporting_reads('read1/1', 'read2/1')`.
    """
    variant = variants_pb2.Variant(
        reference_name='chr1',
        start=10,
        end=11,
        reference_bases=ref_bases,
        alternate_bases=[alt_bases],
    )
    # NOTE: the support map is keyed by the literal 'C', not by `alt_bases`,
    # so the key only matches the alternate allele for the default argument.
    supporting = _supporting_reads('read1/1', 'read2/1')
    return deepvariant_pb2.DeepVariantCall(
        variant=variant, allele_support={'C': supporting})
def test_calls_from_allele_counts(self):
    """Candidates from the mocked C++ caller pass through calls_and_gvcfs.

    The C++ variant caller is replaced by a mock, so this only checks the
    Python plumbing: which arguments reach the mock and that its candidates
    are returned unchanged when gVCF output is disabled.
    """
    # Our test AlleleCounts are 5 positions:
    #
    # 10: A ref [no reads]
    # 11: G/C variant
    # 12: G ref [no reads]
    # 13: G ref [no reads]
    # 14: T/C variant
    sample = 'SAMPLE_ID'
    allele_counter = self.fake_allele_counter(
        10,
        [
            (0, 0, 'A'),
            (10, 10, 'G'),
            (0, 0, 'G'),
            (0, 0, 'G'),
            (10, 10, 'T'),
        ],
    )
    # One fake candidate for each of the two variant positions above.
    fake_candidates = [
        deepvariant_pb2.DeepVariantCall(
            variant=test_utils.make_variant(alleles=['G', 'C'], start=11)),
        deepvariant_pb2.DeepVariantCall(
            variant=test_utils.make_variant(alleles=['T', 'C'], start=14)),
    ]

    caller = self.make_test_caller(0.01, 100)
    with mock.patch.object(caller, 'cpp_variant_caller') as mock_cpp:
        mock_cpp.calls_from_allele_counts.return_value = fake_candidates
        # The gVCF half of the result is discarded; it is not under test.
        candidates, _ = caller.calls_and_gvcfs(
            allele_counters={sample: allele_counter},
            target_sample=sample,
            include_gvcfs=False)

    # The mock must have been given the per-sample `counts()` result and the
    # target sample name, exactly once.
    mock_cpp.calls_from_allele_counts.assert_called_once_with(
        {sample: allele_counter.counts.return_value}, sample)
    self.assertEqual(candidates, fake_candidates)
def test_ignores_reads_with_low_mapping_quality(self, min_base_qual,
                                                min_mapping_qual):
    """Check that we discard reads with low mapping quality.

    We have the following scenario:

        position    0    1    2    3    4    5
        reference        A    A    C    A    G
        read             A    A    A
        variant               C

    We set the mapping quality of the read to different values of
    `mapping_qual`. All bases in the read have base quality exactly
    `min_base_qual`. The read should be kept if and only if
    `mapping_qual` >= `min_mapping_qual`: a read whose mapping quality
    equals the minimum is expected to be encoded, not dropped.

    Args:
        min_base_qual: Reads are discarded if the base at a variant start
            position does not meet this base quality requirement.
        min_mapping_qual: Reads are discarded if they do not meet this
            mapping quality requirement.
    """
    dv_call = deepvariant_pb2.DeepVariantCall(
        variant=variants_pb2.Variant(
            reference_name='chr1',
            start=2,
            end=3,
            reference_bases='A',
            alternate_bases=['C']))
    read_requirements = reads_pb2.ReadRequirements(
        min_base_quality=min_base_qual,
        min_mapping_quality=min_mapping_qual,
        min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
    pie = _make_encoder(read_requirements=read_requirements)

    # Sweep mapping quality from 0 up to four above the minimum so that both
    # sides of the threshold, and the threshold itself, are exercised.
    for mapping_qual in range(min_mapping_qual + 5):
        quals = [min_base_qual, min_base_qual, min_base_qual]
        read = test_utils.make_read(
            'AAA', start=1, cigar='3M', quals=quals, mapq=mapping_qual)
        actual = pie.encode_read(dv_call, 'AACAG', read, 1, 'C')
        # Name the swept value so a failure identifies the iteration.
        msg = 'mapping_qual=%d' % mapping_qual
        if mapping_qual < min_mapping_qual:
            self.assertIsNone(actual, msg=msg)
        else:
            self.assertIsNotNone(actual, msg=msg)
def test_keeps_reads_with_low_quality_bases(self, min_base_qual,
                                            min_mapping_qual):
    """Check that we keep reads with adequate quality at variant start position.

    We have the following scenario:

        position    0    1    2    3    4    5
        reference        A    A    C    A    G
        read             A    A    A
        variant               C

    We set the base quality of the first and third bases in the read to
    different functions of `base_qual`. The middle position of the read is
    where the variant starts, and this position always has base quality
    exactly `min_base_qual`, which meets the requirement. Thus, the read
    should always be kept, however low the flanking qualities are.

    Args:
        min_base_qual: Reads are discarded if the base at a variant start
            position does not meet this base quality requirement.
        min_mapping_qual: Reads are discarded if they do not meet this
            mapping quality requirement.
    """
    dv_call = deepvariant_pb2.DeepVariantCall(
        variant=variants_pb2.Variant(
            reference_name='chr1',
            start=2,
            end=3,
            reference_bases='A',
            alternate_bases=['C']))
    read_requirements = reads_pb2.ReadRequirements(
        min_base_quality=min_base_qual,
        min_mapping_quality=min_mapping_qual,
        min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
    pie = _make_encoder(read_requirements=read_requirements)

    for base_qual in range(min_base_qual + 5):
        # Only the flanking bases vary; the variant-start base is pinned at
        # the minimum. Note the first flank is -1 when base_qual is 0.
        quals = [base_qual - 1, min_base_qual, base_qual + 1]
        read = test_utils.make_read(
            'AAA', start=1, cigar='3M', quals=quals, mapq=min_mapping_qual)
        actual = pie.encode_read(dv_call, 'AACAG', read, 1, 'C')
        # Name the swept value so a failure identifies the iteration.
        self.assertIsNotNone(actual, msg='base_qual=%d' % base_qual)
def test_ignores_reads_with_low_quality_bases(self):
    """Reads are dropped when the base at the variant start is low quality.

    The read covers positions 1-3 and the variant starts at position 2, so
    the middle base is the one sitting on the variant start. Its quality is
    swept across the threshold while the flanking bases stay fixed at one
    below and one above it. The read must be encoded exactly when the middle
    base quality is at least the threshold.
    """
    variant = variants_pb2.Variant(
        reference_name='chr1',
        start=2,
        end=3,
        reference_bases='A',
        alternate_bases=['C'],
    )
    dv_call = deepvariant_pb2.DeepVariantCall(variant=variant)
    pie = _make_encoder()

    # Threshold read from this test's options; presumably the same value the
    # default encoder applies -- confirm against `_make_encoder`.
    min_qual = self.options.read_requirements.min_base_quality

    for qual in range(min_qual + 5):
        read = test_utils.make_read(
            'AAA',
            start=1,
            cigar='3M',
            quals=[min_qual - 1, qual, min_qual + 1])
        actual = pie.encode_read(dv_call, 'AACAG', read, 1, 'C')
        should_be_dropped = qual < min_qual
        check = self.assertIsNone if should_be_dropped else self.assertIsNotNone
        check(actual)
def test_calls_from_allele_counts(self, include_gvcfs):
    """Check candidate pass-through and gVCF records from the allele counter.

    Args:
        include_gvcfs: Whether gVCF records are requested from the caller.
            When false, the returned gVCF collection must be empty.
    """
    # Our test AlleleCounts are 5 positions:
    #
    # 10: A ref [no reads]
    # 11: G/C variant
    # 12: G ref [no reads]
    # 13: G ref [no reads]
    # 14: T/C variant
    #
    # The ref sites have no reads for ref or any alt simply because it
    # simplifies comparing them with the expected variant genotype
    # likelihoods. The correctness of the gvcf calculation is tested
    # elsewhere; the focus here is the separation of variants from gvcf
    # records, and the automatic merging of the gvcf blocks.
    allele_counter = self.fake_allele_counter(
        10,
        [
            (0, 0, 'A'),
            (10, 10, 'G'),
            (0, 0, 'G'),
            (0, 0, 'G'),
            (10, 10, 'T'),
        ],
    )
    fake_candidates = [
        deepvariant_pb2.DeepVariantCall(
            variant=test_utils.make_variant(alleles=['G', 'C'], start=11)),
        deepvariant_pb2.DeepVariantCall(
            variant=test_utils.make_variant(alleles=['T', 'C'], start=14)),
    ]

    caller = self.make_test_caller(0.01, 100)
    with mock.patch.object(caller, 'cpp_variant_caller') as mock_cpp:
        mock_cpp.calls_from_allele_counter.return_value = fake_candidates
        candidates, gvcfs = caller.calls_from_allele_counter(
            allele_counter, include_gvcfs)

    mock_cpp.calls_from_allele_counter.assert_called_once_with(allele_counter)
    self.assertEqual(candidates, fake_candidates)

    # With gvcfs disabled nothing should be emitted.
    if not include_gvcfs:
        self.assertEmpty(gvcfs)
        return

    # We expect a gvcf at position 10, and positions 12 and 13 merged into a
    # single 2 bp block. Four records are expected in total; only the first
    # three are inspected below.
    self.assertLen(gvcfs, 4)

    # Expected diploid genotype likelihoods when there's no coverage. The
    # chance of having each genotype is 1/3, in log10 space.
    flat_gls = np.log10([1.0 / 3] * 3)

    self.assertGVCF(
        gvcfs[0], ref='A', start=10, end=11, gq=1, min_dp=0, gls=flat_gls)
    self.assertGVCF(
        gvcfs[1],
        ref='G',
        start=11,
        end=12,
        gq=0,
        min_dp=20,
        gls=np.array([-14.0230482368, -7.993606e-15, -14.0230482368]),
        # The genotype should NOT be called here ("./.") as the likelihood
        # for het is greater than hom_ref.
        gts=[-1, -1])
    self.assertGVCF(
        gvcfs[2], ref='G', start=12, end=14, gq=1, min_dp=0, gls=flat_gls)
class AlleleFrequencyTest(parameterized.TestCase):
    """Tests for the helpers in the `allele_frequency` module.

    Readers that a test opens itself are used as context managers so they
    are released even when an assertion fails.
    """

    @parameterized.parameters(
        # A SNP.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60168, end=60169,
                reference_bases='C', alternate_bases=['T']),
            reference_haplotype='GCACCT',
            reference_offset=60165,
            expected_return=[{
                'haplotype': 'GCATCT',
                'alt': 'T',
                'variant': variants_pb2.Variant(
                    reference_name='chr20', start=60168, end=60169,
                    reference_bases='C', alternate_bases=['T']),
            }]),
        # A deletion.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60284, end=60291,
                reference_bases='ATTCCAG', alternate_bases=['AT']),
            reference_haplotype='TTTCCATTCCAGTCCAT',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCATTCCAT',
                'alt': 'AT',
                'variant': variants_pb2.Variant(
                    reference_name='chr20', start=60284, end=60291,
                    reference_bases='ATTCCAG', alternate_bases=['AT']),
            }]),
        # An insertion.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60279, end=60285,
                reference_bases='TTTCCA', alternate_bases=['TTTCCATTCCA']),
            reference_haplotype='TTTCCATTCCAGTCCAT',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCATTCCATTCCAGTCCAT',
                'alt': 'TTTCCATTCCA',
                'variant': variants_pb2.Variant(
                    reference_name='chr20', start=60279, end=60285,
                    reference_bases='TTTCCA',
                    alternate_bases=['TTTCCATTCCA']),
            }]),
        # A deletion, against a shorter haplotype that ends exactly at the
        # variant end.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60284, end=60291,
                reference_bases='ATTCCAG', alternate_bases=['AT']),
            reference_haplotype='TTTCCATTCCAG',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCAT',
                'alt': 'AT',
                'variant': variants_pb2.Variant(
                    reference_name='chr20', start=60284, end=60291,
                    reference_bases='ATTCCAG', alternate_bases=['AT']),
            }]),
        # An insertion, against the same shorter haplotype.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60279, end=60285,
                reference_bases='TTTCCA', alternate_bases=['TTTCCATTCCA']),
            reference_haplotype='TTTCCATTCCAG',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCATTCCATTCCAG',
                'alt': 'TTTCCATTCCA',
                'variant': variants_pb2.Variant(
                    reference_name='chr20', start=60279, end=60285,
                    reference_bases='TTTCCA',
                    alternate_bases=['TTTCCATTCCA']),
            }]))
    def test_update_haplotype(self, variant, reference_haplotype,
                              reference_offset, expected_return):
        """update_haplotype returns the expected list of haplotype dicts."""
        list_hap_obj = allele_frequency.update_haplotype(
            variant, reference_haplotype, reference_offset)
        self.assertListEqual(list_hap_obj, expected_return)

    @parameterized.parameters([
        dict(
            dv_variant=variants_pb2.Variant(
                reference_name='chr20', start=60284, end=60291,
                reference_bases='ATTCCAG', alternate_bases=['AT']),
            cohort_variants=[
                variants_pb2.Variant(
                    reference_name='chr20', start=60279, end=60285,
                    reference_bases='TTTCCA',
                    alternate_bases=['T', 'TTTCCATTCCA']),
                variants_pb2.Variant(
                    reference_name='chr20', start=60285, end=60291,
                    reference_bases='TTTCCA', alternate_bases=['T']),
            ],
            expected_ref_haplotype='TTTCCATTCCAG',
            expected_ref_offset=60279)
    ])
    def test_get_ref_haplotype_and_offset(self, dv_variant, cohort_variants,
                                          expected_ref_haplotype,
                                          expected_ref_offset):
        """The reference haplotype and its offset match the expected values."""
        with fasta.IndexedFastaReader(testdata.GRCH38_FASTA) as ref_reader:
            ref_haplotype, ref_offset = (
                allele_frequency.get_ref_haplotype_and_offset(
                    dv_variant, cohort_variants, ref_reader))
        self.assertEqual(ref_haplotype, expected_ref_haplotype)
        self.assertEqual(ref_offset, expected_ref_offset)

    @parameterized.parameters(
        # A matched SNP.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60168, end=60169,
                reference_bases='C', alternate_bases=['T']),
            expected_return=dict(C=0.9998, T=0.0002),
            label='matched_snp_1'),
        # A matched deletion.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60285, end=60291,
                reference_bases='TTCCAG', alternate_bases=['T']),
            expected_return=dict(T=0.001198, TTCCAG=0.998802),
            label='matched_del_1'),
        # An unmatched deletion.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60284, end=60291,
                reference_bases='ATTCCAG', alternate_bases=['A']),
            expected_return=dict(A=0, ATTCCAG=1),
            label='unmatched_del_1'),
        # A matched deletion, where the candidate is formatted differently.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60284, end=60291,
                reference_bases='ATTCCAG', alternate_bases=['AT']),
            expected_return=dict(AT=0.001198, ATTCCAG=0.998802),
            label='matched_del_2: diff representation'),
        # An unmatched SNP.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60150, end=60151,
                reference_bases='C', alternate_bases=['T']),
            expected_return=dict(C=1, T=0),
            label='unmatched_snp_1'),
        # A matched SNP and an unmatched SNP.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60168, end=60169,
                reference_bases='C', alternate_bases=['T', 'A']),
            expected_return=dict(C=0.9998, T=0.0002, A=0),
            label='mixed_snp_1'),
        # An unmatched SNP, where the REF allele frequency is not 1.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60168, end=60169,
                reference_bases='C', alternate_bases=['A']),
            expected_return=dict(C=0.9998, A=0),
            label='unmatched_snp_2: non-1 ref allele'),
        # A multi-allelic candidate at a multi-allelic locus.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60279, end=60285,
                reference_bases='TTTCCA',
                alternate_bases=['T', 'TTTCCATTCCA']),
            expected_return=dict(
                TTTCCA=0.999401, T=0.000399, TTTCCATTCCA=0.0002),
            label='matched_mult_1'),
        # A multi-allelic candidate at a multi-allelic locus.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60279, end=60285,
                reference_bases='TTTCCA',
                alternate_bases=['T', 'TATCCATTCCA']),
            expected_return=dict(TTTCCA=0.999401, T=0.000399, TATCCATTCCA=0),
            label='unmatched_mult_1'),
        # [Different representation]
        # A deletion where the cohort variant is represented differently.
        # In this case, REF frequency is calculated by going over all cohort
        # ALTs. Thus, the sum of all dict values is not equal to 1.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60295, end=60301,
                reference_bases='TTCCAT', alternate_bases=['T']),
            expected_return=dict(T=0.000399, TTCCAT=0.923922),
            label='matched_del_3: diff representation'),
        # [Non-candidate allele]
        # One allele of a multi-allelic cohort variant is not in candidate.
        # The non-candidate allele should be ignored.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=60279, end=60285,
                reference_bases='TTTCCA', alternate_bases=['T']),
            expected_return=dict(TTTCCA=0.999401, T=0.000399),
            label='matched_del_4: multi-allelic cohort'),
        # A left-align example.
        # NOTE(review): end - start is 4 but reference_bases has 2 bases;
        # confirm this span is intentional test data.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=9074790, end=9074794,
                reference_bases='CT', alternate_bases=['C', 'CTTT']),
            expected_return=dict(C=0.167732, CTTT=0.215256, CT=0.442092),
            label='matched_mult_2: left align'),
        # A left-align example.
        # NOTE(review): end - start is 4 but reference_bases has 1 base;
        # confirm this span is intentional test data.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=9074790, end=9074794,
                reference_bases='C', alternate_bases=['CTTT']),
            expected_return=dict(CTTT=0.145367, C=0.442092),
            label='matched_ins_1: left align'),
        # A left-align example.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=9074790, end=9074793,
                reference_bases='CTT', alternate_bases=['CTTA']),
            expected_return=dict(CTTA=0, CTT=0.442092),
            label='unmatched_ins_1: left align'),
        # A matched mnps.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=61065, end=61066,
                reference_bases='T', alternate_bases=['C']),
            expected_return=dict(C=0.079872, T=0.919729),
            label='matched_mnps_1'),
        # A matched SNP.
        dict(
            variant=variants_pb2.Variant(
                reference_name='chr20', start=62022, end=62023,
                reference_bases='G', alternate_bases=['C', 'T']),
            expected_return=dict(G=0.996206, C=0.003594, T=0),
            label='matched_snp_2'))
    def test_find_matching_allele_frequency(self, variant, expected_return,
                                            label):
        """Allele frequencies found for `variant` match `expected_return`.

        Args:
            variant: Candidate variant to look up.
            expected_return: Expected allele -> frequency mapping.
            label: Case name, attached to assertion failures.
        """
        with fasta.IndexedFastaReader(testdata.GRCH38_FASTA) as ref_reader:
            with vcf.VcfReader(
                    testdata.VCF_WITH_ALLELE_FREQUENCIES) as vcf_reader:
                allele_frequencies = (
                    allele_frequency.find_matching_allele_frequency(
                        variant, vcf_reader, ref_reader))
        # Compare keys.
        self.assertSetEqual(
            set(allele_frequencies.keys()),
            set(expected_return.keys()),
            msg=label)
        # Compare values (almost equal).
        for allele, frequency in allele_frequencies.items():
            self.assertAlmostEqual(
                frequency, expected_return[allele], msg=label)

    def test_make_population_vcf_readers_with_multiple_vcfs(self):
        """Each single-chromosome VCF is mapped to its own chromosome."""
        filenames = [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21]
        output = allele_frequency.make_population_vcf_readers(filenames)
        self.assertIsInstance(output['chr20'], vcf.VcfReader)
        self.assertIsInstance(output['chr21'], vcf.VcfReader)
        self.assertEqual(next(output['chr20']).reference_name, 'chr20')
        self.assertEqual(next(output['chr21']).reference_name, 'chr21')
        # Check that chr22 has no reader rather than outputting another
        # reader for a different chromosome.
        self.assertIsNone(output['chr22'])

    def test_make_population_vcf_readers_with_one_vcf(self):
        """A single VCF is returned for every chromosome name."""
        filenames = [testdata.AF_VCF_CHR20_AND_21]
        output = allele_frequency.make_population_vcf_readers(filenames)
        self.assertIsInstance(output['chr20'], vcf.VcfReader)
        self.assertIsInstance(output['chr21'], vcf.VcfReader)
        self.assertIsInstance(output['chr22'], vcf.VcfReader)
        # All reference names should map to the same VCF that starts with
        # chr20.
        self.assertEqual(next(output['chr20']).reference_name, 'chr20')
        self.assertEqual(next(output['chr21']).reference_name, 'chr20')
        self.assertEqual(next(output['chr22']).reference_name, 'chr20')

    def test_make_population_vcf_readers_raises_on_shared_chromosomes(self):
        """A chromosome covered by more than one VCF raises ValueError."""
        filenames = [
            testdata.AF_VCF_CHR20,
            testdata.AF_VCF_CHR21,
            testdata.AF_VCF_CHR20_AND_21,
        ]
        with self.assertRaisesRegex(
                expected_exception=ValueError,
                expected_regex='Variants on chr20 are included in multiple VCFs'):
            allele_frequency.make_population_vcf_readers(filenames)

    @parameterized.parameters(
        # Candidates are stored as plain lists and turned into iterators in
        # the test body: an iterator built here would be created once at
        # class-definition time and be exhausted after its first use.
        dict(
            dv_calls=[
                deepvariant_pb2.DeepVariantCall(
                    variant=variants_pb2.Variant(
                        reference_name='chr20', start=60168, end=60169,
                        reference_bases='C', alternate_bases=['T']),
                    allele_support=None)
            ],
            expected_return=dict(C=0.9998, T=0.0002),
            testcase='valid'),
        dict(
            dv_calls=[
                deepvariant_pb2.DeepVariantCall(
                    variant=variants_pb2.Variant(
                        reference_name='chrM', start=10000, end=10001,
                        reference_bases='T', alternate_bases=['G']),
                    allele_support=None)
            ],
            expected_return=dict(T=1, G=0),
            testcase='no VCF'))
    def test_add_allele_frequencies_to_candidates(self, dv_calls,
                                                  expected_return, testcase):
        """The first annotated candidate carries the expected frequencies.

        Args:
            dv_calls: List of candidate DeepVariantCall protos.
            expected_return: Expected allele -> frequency mapping.
            testcase: 'valid' to annotate using the population VCF and
                reference readers, 'no VCF' to pass None for both.

        Raises:
            ValueError: If `testcase` is not one of the two known values.
        """

        def annotate(pop_vcf_reader, ref_reader):
            # The function under test still receives an iterator, but a
            # fresh one on every run. The result is materialised here so the
            # readers are only needed while this call is running.
            return list(
                allele_frequency.add_allele_frequencies_to_candidates(
                    iter(dv_calls), pop_vcf_reader, ref_reader))

        if testcase == 'valid':
            with vcf.VcfReader(
                    testdata.VCF_WITH_ALLELE_FREQUENCIES) as pop_vcf_reader:
                with fasta.IndexedFastaReader(
                        testdata.GRCH38_FASTA) as ref_reader:
                    updated_dv_call = annotate(pop_vcf_reader, ref_reader)
        elif testcase == 'no VCF':
            updated_dv_call = annotate(None, None)
        else:
            raise ValueError('Invalid testcase for parameterized test.')

        actual_frequency = updated_dv_call[0].allele_frequency
        # Compare keys.
        self.assertSetEqual(
            set(actual_frequency.keys()), set(expected_return.keys()))
        # Compare values (almost equal).
        for allele, frequency in actual_frequency.items():
            self.assertAlmostEqual(frequency, expected_return[allele])