def testExampleSetTruthVariant(self):
    """Checks example_set_truth_variant with and without simplification."""
    # Fields that appear in both the full and the simplified truth variant.
    shared_fields = dict(
        reference_name='1',
        start=10,
        end=11,
        reference_bases='C',
        alternate_bases=['A'],
        filter=['PASS'],
        quality=1234.5,
    )

    full_call = variants_pb2.VariantCall(
        call_set_id='call_set_id',
        call_set_name='call_set_name',
        genotype=[0, 1],
        phaseset='phaseset',
        genotype_likelihood=[0.1, 0.2, 0.3])
    full_tvariant = variants_pb2.Variant(
        variant_set_id='variant_set_id',
        id='id',
        names=['name1'],
        created=1234,
        calls=[full_call],
        **shared_fields)
    test_utils.set_list_values(full_tvariant.info['key'], [1])
    test_utils.set_list_values(full_tvariant.calls[0].info['key'], [2])

    # Expected value once the full variant is re-encoded with simplify=True.
    simple_call = variants_pb2.VariantCall(
        call_set_name='call_set_name', genotype=[0, 1])
    simple_tvariant = variants_pb2.Variant(calls=[simple_call], **shared_fields)
    test_utils.set_list_values(simple_tvariant.calls[0].info['key'], [2])

    example = tf_utils.make_example(
        self.variant, self.alts, self.encoded_image, self.default_shape,
        self.default_format)

    # A freshly made example carries no truth variant yet.
    self.assertIsNotAFeature('truth_variant/encoded', example)

    # Without simplification the variant round-trips unchanged.
    tf_utils.example_set_truth_variant(example, full_tvariant, simplify=False)
    self.assertEqual(full_tvariant, tf_utils.example_truth_variant(example))

    # Check that reencoding with simplify=True produces the simplified version.
    tf_utils.example_set_truth_variant(example, full_tvariant, simplify=True)
    self.assertEqual(simple_tvariant, tf_utils.example_truth_variant(example))
def test_add_call_to_variant(self, probs, expected):
    """Checks the Variant produced by add_call_to_variant.

    Args:
      probs: Predictions passed to postprocess_variants.add_call_to_variant.
      expected: Variant holding the field values the result must match.
    """
    raw_variant = variants_pb2.Variant(
        reference_name=expected.reference_name,
        reference_bases=expected.reference_bases,
        alternate_bases=expected.alternate_bases,
        start=expected.start,
        end=expected.end,
        calls=[
            variants_pb2.VariantCall(call_set_name=_DEFAULT_SAMPLE_NAME)
        ])
    variant = postprocess_variants.add_call_to_variant(
        variant=raw_variant,
        predictions=probs,
        sample_name=_DEFAULT_SAMPLE_NAME)

    # The site-level fields are compared against expected.
    self.assertEqual(variant.reference_bases, expected.reference_bases)
    self.assertEqual(variant.alternate_bases, expected.alternate_bases)
    self.assertEqual(variant.reference_name, expected.reference_name)
    self.assertEqual(variant.start, expected.start)
    self.assertEqual(variant.end, expected.end)
    # assertAlmostEqual is used instead of the assertAlmostEquals alias: the
    # alias is deprecated and was removed from unittest in Python 3.12.
    self.assertAlmostEqual(variant.quality, expected.quality, places=6)
    self.assertEqual(variant.filter, expected.filter)

    # Exactly one call is expected on both sides.
    self.assertEqual(len(variant.calls), 1)
    self.assertEqual(len(expected.calls), 1)
    self.assertEqual(variant.calls[0].genotype, expected.calls[0].genotype)
    self.assertEqual(variant.calls[0].info['GQ'], expected.calls[0].info['GQ'])

    # Likelihoods are floats, so compare them pairwise with a tolerance.
    for gl, expected_gl in zip(variant.calls[0].genotype_likelihood,
                               expected.calls[0].genotype_likelihood):
        self.assertAlmostEqual(gl, expected_gl, places=6)
def _create_variant_with_alleles(ref=None, alts=None, start=0):
    """Builds a Variant with the given alleles and one default-sample call.

    Args:
      ref: Value for the Variant's reference_bases field.
      alts: Value for the Variant's alternate_bases field.
      start: Value for the Variant's start field.

    Returns:
      A Variant whose only call has call_set_name _DEFAULT_SAMPLE_NAME.
    """
    default_call = variants_pb2.VariantCall(call_set_name=_DEFAULT_SAMPLE_NAME)
    return variants_pb2.Variant(
        reference_bases=ref,
        alternate_bases=alts,
        start=start,
        calls=[default_call])
def test_exception_extract_single_variant_name(self, names):
    """Checks that _extract_single_sample_name raises for the given names.

    Args:
      names: Call-set names; one VariantCall is created per name.
    """
    variant = variants_pb2.Variant(
        calls=[variants_pb2.VariantCall(call_set_name=n) for n in names])
    record = deepvariant_pb2.CallVariantsOutput(variant=variant)
    # NOTE(review): assertRaisesRegexp is a deprecated alias that was removed
    # in Python 3.12; switch to assertRaisesRegex once it is confirmed that
    # every supported interpreter / test base class provides that name.
    with self.assertRaisesRegexp(ValueError, 'Error extracting name:'):
        postprocess_variants._extract_single_sample_name(record)
def _make_synthetic_hom_ref(self, variant):
    """Builds a hom-ref copy of a candidate variant.

    Only the position and allele fields are carried over from variant; the
    result holds a single call whose genotype is [0, 0].

    Args:
      variant: The candidate Variant to copy position and alleles from.

    Returns:
      A new Variant with the same reference_name, start, end, reference_bases
      and alternate_bases as variant, and a hom-ref genotype.
    """
    hom_ref_call = variants_pb2.VariantCall(genotype=[0, 0])
    return variants_pb2.Variant(
        reference_name=variant.reference_name,
        start=variant.start,
        end=variant.end,
        reference_bases=variant.reference_bases,
        alternate_bases=variant.alternate_bases,
        calls=[hom_ref_call])
def test_unsupported_genotype_likelihood(self):
    """Checks that genotype_likelihood rejects a three-index allele list."""
    call_with_gls = variants_pb2.VariantCall(genotype_likelihood=[-1, -2, -3])
    expected_message = 'only supports haploid and diploid'
    # NOTE(review): assertRaisesRegexp is a deprecated alias that was removed
    # in Python 3.12; switch to assertRaisesRegex once it is confirmed that
    # every supported interpreter / test base class provides that name.
    with self.assertRaisesRegexp(NotImplementedError, expected_message):
        variantutils.genotype_likelihood(call_with_gls, [0, 1, 1])
def test_genotype_likelihood(self, gls, allele_indices, expected):
    """Checks genotype_likelihood for one set of likelihoods and indices.

    Args:
      gls: Values for the VariantCall's genotype_likelihood field.
      allele_indices: Allele indices passed to genotype_likelihood.
      expected: The value genotype_likelihood must return.
    """
    call_with_gls = variants_pb2.VariantCall(genotype_likelihood=gls)
    self.assertEqual(
        variantutils.genotype_likelihood(call_with_gls, allele_indices),
        expected)
def _simplify_variant_call(call):
    """Copies only call_set_name, genotype and info into a new VariantCall.

    Args:
      call: The VariantCall to simplify.

    Returns:
      A new VariantCall carrying just those three fields of call.
    """
    # dict() is necessary to actually set info.
    info_copy = dict(call.info)
    return variants_pb2.VariantCall(
        call_set_name=call.call_set_name,
        genotype=call.genotype,
        info=info_copy)
def make_gvcfs(self, allele_count_summaries):
    """Primary interface function for computing gVCF confidence at a site.

    Looks at the counts in the provided AlleleCountSummary protos and yields
    properly-formatted Variant protos containing gVCF reference blocks for the
    sites in allele_count_summaries.

    Each yielded Variant has reference_name, reference_bases, start and end
    set, has variantutils.GVCF_ALT_ALLELE as its only alternate allele, and
    contains a single VariantCall in the calls field with call_set_name of
    options.sample_name, genotype set to [0, 0] (diploid reference), the
    genotype likelihoods of the block's first site, and a GQ (the minimum raw
    GQ over the block) recorded through variantutils.set_variantcall_gq.

    The provided allele counts must have either a canonical DNA sequence base
    (A, C, G, T) or an extended IUPAC code such as "N" as reference base.

    Args:
      allele_count_summaries: iterable of AlleleCountSummary protos in
        coordinate-sorted order. Each proto is used to get the read counts for
        reference and alternate alleles, the reference position, and reference
        base.

    Yields:
      third_party.nucleus.protos.Variant proto in coordinate-sorted order
      containing gVCF records.
    """

    def with_gq_and_likelihoods(summary_counts):
        """Returns summary_counts along with GQ and genotype likelihoods.

        If the reference base is in EXTENDED_IUPAC_CODES rather than in
        CANONICAL_DNA_BASES, the GQs and genotype likelihoods are all None.

        Args:
          summary_counts: A single AlleleCountSummary.

        Returns:
          A tuple of summary_counts, quantized GQ, raw GQ, and genotype
          likelihoods for summary_counts, where raw GQ and the likelihoods
          are calculated by self.reference_confidence.

        Raises:
          ValueError: The reference base is not a valid DNA or IUPAC base.
        """
        ref_base = summary_counts.ref_base

        if ref_base in CANONICAL_DNA_BASES:
            raw_gq, likelihoods = self.reference_confidence(
                summary_counts.ref_supporting_read_count,
                summary_counts.total_read_count)
            quantized_gq = _quantize_gq(raw_gq, self.options.gq_resolution)
            return summary_counts, quantized_gq, raw_gq, likelihoods

        if ref_base in EXTENDED_IUPAC_CODES:
            # Skip calculating gq and likelihoods, since this is an ambiguous
            # reference base.
            return summary_counts, None, None, None

        raise ValueError(
            'Invalid reference base={} found during gvcf '
            'calculation'.format(ref_base))

    # Combines contiguous, compatible single-bp blocks into larger gVCF blocks,
    # respecting non-reference variants interspersed among them. Yields each
    # combined gVCF Variant proto, in order. Compatible right now means that
    # the blocks to be merged have the same non-None quantized GQ value.
    annotated_sites = (
        with_gq_and_likelihoods(sc) for sc in allele_count_summaries)
    for quantized_gq, run in itertools.groupby(
            annotated_sites, key=operator.itemgetter(1)):
        if quantized_gq is None:
            # A None key indicates that a non-DNA reference base was
            # encountered, so skip this group.
            continue

        block = list(run)
        # The block reports the lowest raw GQ seen among its sites.
        min_gq = min(site[2] for site in block)
        first_summary, _, _, first_likelihoods = block[0]

        call = variants_pb2.VariantCall(
            call_set_name=self.options.sample_name,
            genotype=[0, 0],
            genotype_likelihood=first_likelihoods)
        variantutils.set_variantcall_gq(call, min_gq)

        last_summary = block[-1][0]
        yield variants_pb2.Variant(
            reference_name=first_summary.reference_name,
            reference_bases=first_summary.ref_base,
            alternate_bases=[variantutils.GVCF_ALT_ALLELE],
            start=first_summary.position,
            # end is one past the position of the block's last site.
            end=last_summary.position + 1,
            calls=[call])