def test_label_variant(self):
  # Checks that label_variant() consults the labeler and adds the label and
  # truth variant to the example while leaving its existing features intact.
  variant = test_utils.make_variant(start=10, alleles=['A', 'C'])
  tvariant = test_utils.make_variant(
      start=10, alleles=['A', 'C'], gt=[0, 1])
  example = tf_utils.make_example(variant, ['C'], 'foo', self.default_shape,
                                  self.default_format)

  # Stub labeler: match() reports [True, tvariant] and the alt count is 1.
  labeler = mock.Mock()
  labeler.match = mock.Mock(return_value=[True, tvariant])
  labeler.match_to_alt_count = mock.Mock(return_value=1)
  self.processor.labeler = labeler

  # Label a copy so the untouched original can be compared feature by
  # feature afterwards.
  labeled = example_pb2.Example()
  labeled.CopyFrom(example)
  self.processor.label_variant(labeled, variant)

  labeler.match.assert_called_once_with(variant)
  labeler.match_to_alt_count.assert_called_once_with(
      variant, tvariant, ['C'])

  # items() rather than the Python-2-only iteritems(), so this loop runs
  # under both Python 2 and Python 3.
  for key, value in example.features.feature.items():
    self.assertEqual(value, labeled.features.feature[key])
  self.assertEqual(1, tf_utils.example_label(labeled))
  self.assertEqual(tvariant, tf_utils.example_truth_variant(labeled))
def test_match_to_genotype_label(self, variant_alleles, alt_alleles,
                                 truth_alleles, truth_gt, expected_n_alts):
  # The candidate and the truth variant share start=10; only the truth
  # variant is given a genotype.
  candidate = test_utils.make_variant(start=10, alleles=variant_alleles)
  truth = test_utils.make_variant(
      start=10, alleles=truth_alleles, gt=truth_gt)
  actual_n_alts = self.labeler.match_to_alt_count(
      candidate, truth, alt_alleles)
  self.assertEqual(expected_n_alts, actual_n_alts)
def test_match_selects_variant_by_start(self):
  # match() should pick the truth variant that shares the candidate's start,
  # even when that variant's alleles differ from the candidate's and another
  # overlapping variant has exactly the candidate's alleles.
  sites = [
      (20, ['CC', 'A']),
      (21, ['AAA', 'A']),
      (22, ['AA', 'A']),
  ]
  overlapping = [
      test_utils.make_variant(start=start, alleles=alleles)
      for start, alleles in sites
  ]
  self.labeler = variant_labeler.VariantLabeler(
      vcf_reader=mock_vcf_reader(overlapping))
  candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])
  matched = self.labeler.match(candidate)[1]
  self.assertEqual(matched, overlapping[1])
def testSelectVariantsWeights(self):
  # Four variants at consecutive starts (10..13). The expected weights are
  # 1.0 for the two single-base substitutions and 0.0 for the other two.
  allele_sets = [['C', 'T'], ['C', 'TA'], ['C', 'A'], ['CA', 'T']]
  variants = [
      test_utils.make_variant(start=10 + offset, alleles=alleles)
      for offset, alleles in enumerate(allele_sets)
  ]
  serialized = [variant.SerializeToString() for variant in variants]
  encoded = tf.constant(serialized)
  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())
    op = model_eval.select_variants_weights(
        variantutils.is_snp, encoded, name='tf_is_snp')
    # The op should be named after the requested name.
    self.assertTrue(op.name.startswith('tf_is_snp'))
    npt.assert_array_equal(op.eval(), [1.0, 0.0, 1.0, 0.0])
def test_match_to_genotype_label_no_gt_truth_variant_raises(self):
  # A truth variant created without a gt argument must be rejected.
  expected_error = 'truth_variant needs genotypes'
  with self.assertRaisesRegexp(ValueError, expected_error):
    truth_without_gt = test_utils.make_variant(start=10, alleles=['A', 'C'])
    self.labeler.match_to_alt_count(self.snp, truth_without_gt,
                                    self.snp.alternate_bases)
def _create_variant(ref_name, start, ref_base, alt_bases, qual, filter_field,
                    genotype, gq, likelihoods):
  """Builds a single-call Variant for tests from explicit field values.

  Args:
    ref_name: reference (contig) name of the variant.
    start: start position on the contig.
    ref_base: reference base(s).
    alt_bases: list(str). Alternate base(s); appended after ref_base to form
      the allele list.
    qual: PHRED scaled detection probability.
    filter_field: filter string for the variant.
    genotype: list of ints giving the called genotype.
    gq: PHRED scaled genotype quality.
    likelihoods: genotype likelihoods for the variant.

  Returns:
    The Variant produced by test_utils.make_variant, using
    _DEFAULT_SAMPLE_NAME as the sample name.
  """
  alleles = [ref_base] + alt_bases
  return test_utils.make_variant(
      chrom=ref_name,
      start=start,
      alleles=alleles,
      qual=qual,
      filters=filter_field,
      gt=genotype,
      gq=gq,
      gls=likelihoods,
      sample_name=_DEFAULT_SAMPLE_NAME)
def test_create_pileup_examples(self):
  # Checks that create_pileup_examples() produces one example per
  # (alt alleles, image) pair reported by the pileup image creator.
  self.processor.pic = mock.Mock()
  # _encode_tensor is mocked to yield one (encoded, shape, format) triple per
  # pileup image, in order.
  self.add_mock('_encode_tensor', side_effect=[
      ('tensor1', self.default_shape, self.default_format),
      ('tensor2', self.default_shape, self.default_format),
  ])
  dv_call = mock.Mock()
  dv_call.variant = test_utils.make_variant(start=10, alleles=['A', 'C', 'G'])
  alt1, alt2 = ['C'], ['G']
  self.processor.pic.create_pileup_images.return_value = [
      (alt1, 'tensor1'), (alt2, 'tensor2')
  ]

  actual = self.processor.create_pileup_examples(dv_call)

  self.processor.pic.create_pileup_images.assert_called_once_with(dv_call)
  # assertEqual rather than the deprecated assertEquals alias, matching the
  # other assertions in this test.
  self.assertEqual(len(actual), 2)
  expected = [(alt1, 'tensor1'), (alt2, 'tensor2')]
  for ex, (alt, img) in zip(actual, expected):
    self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
    self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
    self.assertEqual(tf_utils.example_encoded_image(ex), img)
    self.assertEqual(tf_utils.example_image_shape(ex), self.default_shape)
    self.assertEqual(tf_utils.example_image_format(ex), self.default_format)
def test_label_variant_raises_for_non_confident_variant(self):
  # When the labeler's match() reports False as its first element,
  # label_variant() is expected to return a falsy value. Despite the test
  # name, no exception is asserted here.
  variant = test_utils.make_variant(start=10, alleles=['A', 'C'], gt=[0, 1])
  non_confident_labeler = mock.Mock()
  non_confident_labeler.match = mock.Mock(return_value=[False, variant])
  self.processor.labeler = non_confident_labeler
  example = tf_utils.make_example(variant, ['C'], 'foo', self.default_shape,
                                  self.default_format)
  was_labeled = self.processor.label_variant(example, variant)
  self.assertFalse(was_labeled)
def _var(chrom='1', start=5, end=None, ref=None, alt=None, qual=50, genotype=None, likelihoods=None, sample_name='NA12878'): """Creates a Variant record for testing. Args: chrom: reference name for this variant start: start position on the contig end: end position on the contig ref: reference base(s) alt: list(str). alternate base(s) qual: PHRED scaled detection probability genotype: list of integers corresponding to the called genotype likelihoods: genotype likelihoods for this variant sample_name: sample name for the single call in the variant Returns: A Variant record created with the specified arguments. Raises: ValueError: Both ref and end are specified, and are inconsistent. """ if ref is None and end is None: ref = 'A' elif ref is None: ref = 'A' * (end - start) elif ref is not None and end is not None and end != start + len(ref): raise ValueError('Inconsistent end and reference allele.') if alt is None: alt = ['C'] if genotype is None: genotype = [0, 1] if likelihoods is None: likelihoods = [-1.0, -0.0506099933550872, -2.0] return test_utils.make_variant(chrom=chrom, start=start, alleles=[ref] + alt, qual=qual, filters=None, gt=genotype, gls=likelihoods, sample_name=sample_name)
def _create_nonvariant(ref_name, start, end):
  """Builds a non-variant (gVCF-style) Variant record for tests.

  The record always uses 'A' as the reference allele and
  variantutils.GVCF_ALT_ALLELE as its only alternate allele.

  Args:
    ref_name: str. Reference name for this variant.
    start: int. Start position on the contig [0-based, half open).
    end: int. End position on the contig [0-based, half open).

  Returns:
    A non-variant Variant record created with the specified arguments.
  """
  alleles = ['A', variantutils.GVCF_ALT_ALLELE]
  return test_utils.make_variant(
      chrom=ref_name, start=start, end=end, alleles=alleles)
def _simple_variant(ref_name, start, ref_base):
  """Builds a bi-allelic Variant for variant/non-variant merge tests.

  The alternate allele is 'A', unless the reference is itself 'A', in which
  case 'C' is used so the two alleles differ.

  Args:
    ref_name: str. Reference name for this variant.
    start: int. Start position on the contig [0-based, half open).
    ref_base: str. Reference base(s).

  Returns:
    A Variant record spanning [start, start + len(ref_base)).
  """
  alt_base = 'C' if ref_base == 'A' else 'A'
  end = start + len(ref_base)
  return test_utils.make_variant(
      chrom=ref_name, start=start, end=end, alleles=[ref_base, alt_base])
class VariantLabelerTest(parameterized.TestCase):
  """Tests VariantLabeler.match() and VariantLabeler.match_to_alt_count()."""

  # Truth variants served by the mock VCF reader. These are class attributes
  # because the @parameterized.parameters tables below reference them while
  # the class body is being evaluated.

  # Confident variants: SNP, deletion, and multi-allelic.
  snp = test_utils.make_variant(start=10, alleles=['A', 'C'], gt=[0, 1])
  deletion = test_utils.make_variant(start=20, alleles=['ACG', 'A'])
  multiallelic = test_utils.make_variant(start=30,
                                         alleles=['ACT', 'ACTGT', 'A'])
  # Outside our confident regions.
  non_confident = test_utils.make_variant(start=200, alleles=['A', 'C'])
  # A truth variant carrying a filter, plus the variant (same site and
  # alleles, gt=[0, 0]) that test_match expects back for it. Note that
  # filtered_match is not part of `variants`.
  filtered = test_utils.make_variant(start=40, alleles=['A', 'C'],
                                     filters='FAILED')
  filtered_match = test_utils.make_variant(start=40, alleles=['A', 'C'],
                                           gt=[0, 0])
  variants = [snp, deletion, multiallelic, non_confident, filtered]

  def setUp(self):
    # Labeler backed by the variants above, with a single confident region
    # covering positions 10 through 100 on the snp's contig.
    self.labeler = variant_labeler.VariantLabeler(
        vcf_reader=mock_vcf_reader(self.variants),
        confident_regions=ranges.RangeSet(
            [ranges.make_range(self.snp.reference_name, 10, 100)]))

  # Each case is (candidate, expected_confident, expected_variant).
  @parameterized.parameters(
      # Simple tests: we get back our matching variants in the confident
      # regions.
      (snp, True, snp),
      (deletion, True, deletion),
      (multiallelic, True, multiallelic),
      # Test the behavior outside of our confident regions.
      # We get back non_confident since it matches but we're not confident.
      (non_confident, False, non_confident),
      # No matching variant, so we get a None as well as False.
      (test_utils.make_variant(start=300, alleles=['A', 'C']), False, None),
      # This variant doesn't have any match but we're confident in it, so we
      # expect back a gt=[0, 0] variant with the candidate's start and
      # alleles.
      (test_utils.make_variant(start=15, alleles=['C', 'A']), True,
       test_utils.make_variant(start=15, alleles=['C', 'A'], gt=[0, 0])),
      # These variants start at our SNP but have different alleles. We are
      # confident and we get back the true snp variant, despite having the
      # different alleles.
      (test_utils.make_variant(start=snp.start, alleles=['A', 'G' ]), True,
       snp),
      (test_utils.make_variant(start=snp.start, alleles=['AC', 'C' ]), True,
       snp),
      (test_utils.make_variant(start=snp.start, alleles=['A', 'CA' ]), True,
       snp),
      # We don't match filtered variants: the expected result is the
      # gt=[0, 0] filtered_match rather than the filtered truth variant.
      (filtered, True, filtered_match),
  )
  def test_match(self, candidate, expected_confident, expected_variant):
    # match() returns a pair that unpacks as (is_confident, truth_variant).
    actual_confident, actual_variant = self.labeler.match(candidate)
    self.assertEqual(expected_confident, actual_confident)
    self.assertEqual(expected_variant, actual_variant)

  def test_match_selects_variant_by_start(self):
    # Tests that match() selects the variant at the same start even if that
    # variant doesn't have the same alleles as the candidate and there's an
    # overlapping variant with the same alleles.
    overlapping = [
        test_utils.make_variant(start=20, alleles=['CC', 'A']),
        test_utils.make_variant(start=21, alleles=['AAA', 'A']),
        test_utils.make_variant(start=22, alleles=['AA', 'A']),
    ]
    # Replaces the labeler built in setUp; no confident regions are passed.
    self.labeler = variant_labeler.VariantLabeler(
        vcf_reader=mock_vcf_reader(overlapping))
    candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])
    self.assertEqual(self.labeler.match(candidate)[1], overlapping[1])

  # Each case is (variant_alleles, alt_alleles, truth_alleles, truth_gt,
  # expected_n_alts). The first entry of each allele list is the reference
  # allele.
  @parameterized.parameters(
      # Make sure we get the right alt counts for all diploid genotypes.
      (['A', 'C'], ['C'], ['A', 'C'], [0, 0], 0),
      (['A', 'C'], ['C'], ['A', 'C'], [0, 1], 1),
      (['A', 'C'], ['C'], ['A', 'C'], [1, 0], 1),
      (['A', 'C'], ['C'], ['A', 'C'], [1, 1], 2),
      # Basic multi-allelic tests, without having to deal with simplifying
      # alleles as all of the alleles are SNPs. Our candidates have an extra
      # allele, but the true GT is A/C.
      (['A', 'C', 'G'], ['C'], ['A', 'C'], [0, 1], 1),
      (['A', 'C', 'G'], ['C'], ['A', 'C'], [1, 1], 2),
      # When considering A/G our answer should be 0 as we have no copies
      # of the G allele.
      (['A', 'C', 'G'], ['G'], ['A', 'C'], [0, 1], 0),
      (['A', 'C', 'G'], ['G'], ['A', 'C'], [1, 1], 0),
      # We are considering the het-alt configuration here of A vs. C+G. We've
      # got one copy of the C allele so our true genotype is het. If truth is
      # hom-var for the C, though, we again label the composite as hom_var as
      # we have two copies of the C/G alt.
      (['A', 'C', 'G'], ['C', 'G'], ['A', 'C'], [0, 1], 1),
      (['A', 'C', 'G'], ['C', 'G'], ['A', 'C'], [1, 1], 2),
      # Here we have an extra allele in truth, while candidate is bi-allelic.
      # In these examples 'G' is unused in the truth genotype, so we simply
      # get the normal bi-allelic result.
      (['A', 'C'], ['C'], ['A', 'C', 'G'], [0, 0], 0),
      (['A', 'C'], ['C'], ['A', 'C', 'G'], [0, 1], 1),
      (['A', 'C'], ['C'], ['A', 'C', 'G'], [1, 1], 2),
      # We check here that we get the bi-allelic result even when the extra
      # allele is in position 1 not 2.
      (['A', 'G'], ['G'], ['A', 'C', 'G'], [0, 0], 0),
      (['A', 'G'], ['G'], ['A', 'C', 'G'], [0, 2], 1),
      (['A', 'G'], ['G'], ['A', 'C', 'G'], [2, 2], 2),
      # Now for a real het-alt. We've got three alleles in both, and the true
      # genotype is 1/2.
      (['A', 'C', 'G'], ['C'], ['A', 'C', 'G'], [1, 2], 1),
      (['A', 'C', 'G'], ['G'], ['A', 'C', 'G'], [1, 2], 1),
      (['A', 'C', 'G'], ['C', 'G'], ['A', 'C', 'G'], [1, 2], 2),
      # Test all possible values in candidate against het-alt:
      (['A', 'C', 'G', 'T'], ['C'], ['A', 'C', 'G'], [1, 2], 1),
      (['A', 'C', 'G', 'T'], ['G'], ['A', 'C', 'G'], [1, 2], 1),
      (['A', 'C', 'G', 'T'], ['T'], ['A', 'C', 'G'], [1, 2], 0),
      (['A', 'C', 'G', 'T'], ['C', 'G'], ['A', 'C', 'G'], [1, 2], 2),
      (['A', 'C', 'G', 'T'], ['C', 'T'], ['A', 'C', 'G'], [1, 2], 1),
      (['A', 'C', 'G', 'T'], ['G', 'T'], ['A', 'C', 'G'], [1, 2], 1),
      # Simple start for indel alleles => exact matching works here.
      (['A', 'AC'], ['AC'], ['A', 'AC'], [0, 0], 0),
      (['A', 'AC'], ['AC'], ['A', 'AC'], [0, 1], 1),
      (['A', 'AC'], ['AC'], ['A', 'AC'], [1, 1], 2),
      # We've got a multi-allelic truth, but again exact matching is enough.
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 0], 0),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 1], 1),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [1, 1], 2),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 2], 0),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [1, 2], 1),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [2, 2], 0),
      # This case has an extra allele (A) in truth but the true genotype
      # corresponds to our candidate alleles exactly.
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [0, 2], 1),
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [2, 2], 2),
      # If the true genotype involved just the deletion (A) allele, we don't
      # have that allele in our candidate so we always get 0 copies.
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [0, 1], 0),
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [1, 1], 0),
      # If the truth is het-alt, we can't match the deletion A allele but we
      # do in fact have the A => AC allele as this matches the AC => ACC
      # allele in truth set.
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [1, 2], 1),
      # We have a multi-allelic candidate but a simple bi-allelic truth. Make
      # sure we match correctly. This is a key case, as we should expect that
      # our candidates frequently have extra alleles changing the
      # representation relative to our truth candidates.
      (['ACT', 'A', 'AACT'], ['A'], ['A', 'AA'], [0, 1], 0),
      (['ACT', 'A', 'AACT'], ['A'], ['A', 'AA'], [1, 1], 0),
      (['ACT', 'A', 'AACT'], ['AACT'], ['A', 'AA'], [0, 1], 1),
      (['ACT', 'A', 'AACT'], ['AACT'], ['A', 'AA'], [1, 1], 2),
      (['ACT', 'A', 'AACT'], ['A', 'AACT'], ['A', 'AA'], [0, 1], 1),
      (['ACT', 'A', 'AACT'], ['A', 'AACT'], ['A', 'AA'], [1, 1], 2),
      # The whole complexity: multi-allelic candidate and truth, all with
      # different allele representations.
      # True genotype here is A/AGTGT where ref is AGT [common
      # dinucleotide expansion]. Both candidate and truth have this but each
      # as a different ref so none of the alleles exactly match.
      (['AGTGT', 'A', 'AGT', 'AGTGTGT' ], ['A'],
       ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 0),
      (['AGTGT', 'A', 'AGT', 'AGTGTGT' ], ['AGT'],
       ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
      (['AGTGT', 'A', 'AGT', 'AGTGTGT' ], ['AGTGTGT'],
       ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
      (['AGTGT', 'A', 'AGT', 'AGTGTGT' ], ['A', 'AGT'],
       ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
      (['AGTGT', 'A', 'AGT', 'AGTGTGT' ], ['A', 'AGTGTGT'],
       ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
      (['AGTGT', 'A', 'AGT', 'AGTGTGT' ], ['AGT', 'AGTGTGT'],
       ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 2),
      # Misc. checks with block substitutions.
      (['AT', 'A', 'GC'], ['A'], ['ATT', 'AT', 'A'], [0, 1], 1),
      (['AT', 'A', 'GT'], ['A'], ['A', 'G'], [0, 1], 0),
      (['AT', 'A', 'GT'], ['GT'], ['A', 'G'], [0, 1], 1),
  )
  def test_match_to_genotype_label(self, variant_alleles, alt_alleles,
                                   truth_alleles, truth_gt, expected_n_alts):
    # Candidate and truth are both placed at start=10; only the truth variant
    # carries a genotype.
    variant = test_utils.make_variant(start=10, alleles=variant_alleles)
    truth_variant = test_utils.make_variant(start=10, alleles=truth_alleles,
                                            gt=truth_gt)
    self.assertEqual(
        expected_n_alts,
        self.labeler.match_to_alt_count(variant, truth_variant, alt_alleles))

  def test_match_to_genotype_label_none_truth_variant_raises(self):
    # Passing None as the truth variant must raise ValueError.
    with self.assertRaisesRegexp(ValueError, 'truth_variant cannot be None'):
      self.labeler.match_to_alt_count(self.snp, None,
                                      self.snp.alternate_bases)

  def test_match_to_genotype_label_no_gt_truth_variant_raises(self):
    # A truth variant built without a gt argument must raise ValueError.
    with self.assertRaisesRegexp(ValueError, 'truth_variant needs genotypes'):
      self.labeler.match_to_alt_count(
          self.snp,
          test_utils.make_variant(start=10, alleles=['A', 'C']),
          self.snp.alternate_bases)

  def test_match_to_genotype_label_none_variant_raises(self):
    # Passing None as the candidate variant must raise ValueError.
    with self.assertRaisesRegexp(ValueError, 'variant cannot be None'):
      self.labeler.match_to_alt_count(None, self.snp,
                                      self.snp.alternate_bases)

  def test_match_to_genotype_label_ref_variant_raises(self):
    # A candidate whose allele list holds only the reference allele must
    # raise ValueError.
    with self.assertRaisesRegexp(
        ValueError, 'variant must have at least one alternate allele'):
      self.labeler.match_to_alt_count(
          test_utils.make_variant(start=10, alleles=['A']), self.snp,
          self.snp.alternate_bases)
def test_match_to_genotype_label_ref_variant_raises(self):
  # A candidate whose allele list contains only the reference allele must be
  # rejected.
  expected_error = 'variant must have at least one alternate allele'
  with self.assertRaisesRegexp(ValueError, expected_error):
    ref_only = test_utils.make_variant(start=10, alleles=['A'])
    self.labeler.match_to_alt_count(ref_only, self.snp,
                                    self.snp.alternate_bases)
def test_invalid_nonref_genotype_count(self):
  # A variant built purely from make_variant() defaults should be rejected
  # by _nonref_genotype_count with a single-sample error.
  variant_without_calls = test_utils.make_variant()
  expected_error = 'Expecting only single-sample'
  with self.assertRaisesRegexp(ValueError, expected_error):
    haplotypes._nonref_genotype_count(variant_without_calls)
def test_calls_from_allele_counts(self, include_gvcfs):
  # The fake AlleleCounts cover five positions starting at 10:
  #
  #  10: A ref [no reads]
  #  11: G/C variant
  #  12: G ref [no reads]
  #  13: G ref [no reads]
  #  14: T/C variant
  #
  # The ref sites carry no reads for ref or any alt, which keeps their
  # expected genotype likelihoods simple to state. This test does not verify
  # the gvcf calculation itself (that is covered elsewhere). It checks that
  # variants are separated from gvcf records and that adjacent gvcf blocks
  # are merged automatically.
  first_position = 10
  counts_by_position = [
      (0, 0, 'A'),
      (10, 10, 'G'),
      (0, 0, 'G'),
      (0, 0, 'G'),
      (10, 10, 'T'),
  ]
  allele_counter = self.fake_allele_counter(first_position,
                                            counts_by_position)

  candidate_sites = ((11, ['G', 'C']), (14, ['T', 'C']))
  fake_candidates = [
      deepvariant_pb2.DeepVariantCall(
          variant=test_utils.make_variant(alleles=alleles, start=start))
      for start, alleles in candidate_sites
  ]

  caller = self.make_test_caller(0.01, 100)
  # The C++ caller is patched out so the candidates are fully controlled.
  with mock.patch.object(caller, 'cpp_variant_caller') as mock_cpp:
    mock_cpp.calls_from_allele_counter.return_value = fake_candidates
    candidates, gvcfs = caller.calls_from_allele_counter(
        allele_counter, include_gvcfs)

  mock_cpp.calls_from_allele_counter.assert_called_once_with(allele_counter)
  self.assertEqual(candidates, fake_candidates)

  # With gvcf output disabled there must be no gvcf records at all.
  if not include_gvcfs:
    self.assertEmpty(gvcfs)
    return

  # With gvcf output enabled we expect a record at position 10, and positions
  # 12 and 13 merged into a single 2 bp block.
  # NOTE(review): four records are required by length, but only the first
  # three are checked individually below -- confirm whether the last one
  # should be asserted too.
  self.assertLen(gvcfs, 4)
  # Expected diploid genotype likelihoods when there's no coverage: each of
  # the three genotypes has probability 1/3, expressed in log10 space.
  flat_gls = np.log10([1.0 / 3] * 3)
  variant_site_gls = np.array(
      [-14.0230482368, -8.32667268469e-15, -14.0230482368])
  self.assertGVCF(gvcfs[0], ref='A', start=10, end=11, gq=1, gls=flat_gls)
  self.assertGVCF(
      gvcfs[1], ref='G', start=11, end=12, gq=0, gls=variant_site_gls)
  self.assertGVCF(gvcfs[2], ref='G', start=12, end=14, gq=1, gls=flat_gls)