def add_call_to_variant(variant, predictions, qual_filter=0, sample_name=None):
  """Fills in a Variant record using the prediction probabilities.

  This function sets the genotype, GQ, and genotype likelihoods of the
  variant's single call, as well as the variant.filter and variant.quality
  fields, based on the genotype probabilities in predictions.

  Args:
    variant: third_party.nucleus.protos.Variant protobuf to be filled in with
      info derived from predictions.
    predictions: N element array-like. The real-space probabilities of each
      genotype state for this variant.
    qual_filter: float. If predictions implies that this isn't a reference call
      and the QUAL of the prediction isn't larger than qual_filter variant will
      be marked as FILTERed.
    sample_name: str or None. The name of the sample to assign to the call's
      call_set_name field. If None (the default), call_set_name is left
      unchanged.

  Returns:
    The input Variant record, updated in place.

  Raises:
    ValueError: If variant doesn't have exactly one variant.call record.
  """
  call = variant_utils.only_call(variant)
  # One reference allele plus every alternate allele.
  n_alleles = len(variant.alternate_bases) + 1
  index, genotype = most_likely_genotype(predictions, n_alleles=n_alleles)
  gq, variant.quality = compute_quals(predictions, index)
  # Protobuf string fields reject None on assignment, so only overwrite the
  # sample name when the caller actually supplied one.
  if sample_name is not None:
    call.call_set_name = sample_name
  variantcall_utils.set_gt(call, genotype)
  variantcall_utils.set_gq(call, gq)
  # Each real-space probability is converted by
  # perror_to_bounded_log10_perror before being stored as a likelihood.
  gls = [genomics_math.perror_to_bounded_log10_perror(gp) for gp in predictions]
  variantcall_utils.set_gl(call, gls)
  # Filters are computed last because they depend on the quality set above.
  variant.filter[:] = compute_filter_fields(variant, qual_filter)
  return variant
def add_call_to_variant(variant, predictions, qual_filter=0, sample_name=None):
  """Fills in a Variant record using the prediction probabilities.

  This function sets the genotype, GQ, and genotype likelihoods of the
  variant's single call, as well as the variant.filter and variant.quality
  fields, based on the genotype probabilities in predictions. As a final step
  the variant is passed to uncall_homref_gt_if_lowqual together with
  FLAGS.cnn_homref_call_min_gq.

  Args:
    variant: third_party.nucleus.protos.Variant protobuf to be filled in with
      info derived from predictions.
    predictions: N element array-like. The real-space probabilities of each
      genotype state for this variant.
    qual_filter: float. If predictions implies that this isn't a reference call
      and the QUAL of the prediction isn't larger than qual_filter variant will
      be marked as FILTERed.
    sample_name: str or None. The name of the sample to assign to the call's
      call_set_name field. If None (the default), call_set_name is left
      unchanged.

  Returns:
    The input Variant record, updated in place.

  Raises:
    ValueError: If variant doesn't have exactly one variant.call record.
  """
  call = variant_utils.only_call(variant)
  # One reference allele plus every alternate allele.
  n_alleles = len(variant.alternate_bases) + 1
  index, genotype = most_likely_genotype(predictions, n_alleles=n_alleles)
  gq, variant.quality = compute_quals(predictions, index)
  # Protobuf string fields reject None on assignment, so only overwrite the
  # sample name when the caller actually supplied one.
  if sample_name is not None:
    call.call_set_name = sample_name
  variantcall_utils.set_gt(call, genotype)
  variantcall_utils.set_gq(call, gq)
  # Each real-space probability is converted by
  # perror_to_bounded_log10_perror before being stored as a likelihood.
  gls = [
      genomics_math.perror_to_bounded_log10_perror(gp) for gp in predictions
  ]
  variantcall_utils.set_gl(call, gls)
  # Filters are computed before the hom-ref post-processing below.
  variant.filter[:] = compute_filter_fields(variant, qual_filter)
  # NOTE(review): presumably turns low-GQ hom-ref genotypes into no-calls;
  # confirm against uncall_homref_gt_if_lowqual.
  uncall_homref_gt_if_lowqual(variant, FLAGS.cnn_homref_call_min_gq)
  return variant
def make_gvcfs(self, allele_count_summaries):
  """Generates gVCF reference-block Variant protos from allele counts.

  Every AlleleCountSummary is annotated with a raw and a quantized
  reference-confidence GQ. Consecutive summaries sharing the same quantized GQ
  are merged into one block, and one Variant is yielded per block. A yielded
  Variant takes reference_name, reference_bases and start from the first site
  of its block, has end set to one past the position of the last site, lists
  vcf_constants.GVCF_ALT_ALLELE as its only alternate allele, and carries a
  single VariantCall with:

    * call_set_name set to options.sample_name,
    * genotype [0, 0] (diploid reference),
    * the genotype likelihoods of the first site in the block,
    * GQ set to the smallest raw GQ in the block, and
    * MIN_DP set to the smallest total read count in the block.

  Sites whose reference base is in EXTENDED_IUPAC_CODES but not in
  CANONICAL_DNA_BASES receive no GQ and produce no output.

  Args:
    allele_count_summaries: iterable of AlleleCountSummary protos in
      coordinate-sorted order. Each proto supplies the read counts, the
      reference position, and the reference base of one site.

  Yields:
    third_party.nucleus.protos.Variant protos, one per merged block, in the
    order the blocks are encountered.

  Raises:
    ValueError: During iteration, if a reference base is in neither
      CANONICAL_DNA_BASES nor EXTENDED_IUPAC_CODES.
  """

  def with_gq_and_likelihoods(summary_counts):
    """Wraps summary_counts in a _GVCF record with GQ and likelihoods.

    Args:
      summary_counts: A single AlleleCountSummary.

    Returns:
      A _GVCF holding summary_counts, the quantized GQ, the raw GQ, the
      genotype likelihoods, and the total read count. The two GQ values and
      the likelihoods are None when the reference base is not in
      CANONICAL_DNA_BASES.

    Raises:
      ValueError: The reference base is not a valid DNA or IUPAC base.
    """
    base = summary_counts.ref_base
    is_canonical = base in CANONICAL_DNA_BASES
    # Reject anything that is neither a canonical nor an IUPAC base up front.
    if not is_canonical and base not in EXTENDED_IUPAC_CODES:
      raise ValueError(
          'Invalid reference base={} found during gvcf '
          'calculation'.format(summary_counts.ref_base))

    depth = summary_counts.total_read_count
    if is_canonical:
      raw, gls = self.reference_confidence(
          summary_counts.ref_supporting_read_count, depth)
      quantized = _quantize_gq(raw, self.options.gq_resolution)
    else:
      # Ambiguous reference base: no GQ or likelihoods are computed.
      quantized = raw = gls = None
    return _GVCF(
        summary_counts=summary_counts,
        quantized_gq=quantized,
        raw_gq=raw,
        likelihoods=gls,
        read_depth=depth)

  # Lazily annotate each site, then merge runs of consecutive sites that share
  # a quantized GQ into a single gVCF block.
  annotated_sites = (
      with_gq_and_likelihoods(sc) for sc in allele_count_summaries)
  blocks = itertools.groupby(
      annotated_sites, key=operator.attrgetter('quantized_gq'))
  for quantized_gq, members in blocks:
    # A None GQ marks a run of ambiguous reference bases; emit nothing for it.
    if quantized_gq is None:
      continue
    sites = list(members)
    head = sites[0]
    tail = sites[-1]
    block_gq = min(site.raw_gq for site in sites)
    block_dp = min(site.read_depth for site in sites)

    call = variants_pb2.VariantCall(
        call_set_name=self.options.sample_name,
        genotype=[0, 0],
        genotype_likelihood=head.likelihoods)
    variantcall_utils.set_gq(call, block_gq)
    variantcall_utils.set_min_dp(call, block_dp)
    yield variants_pb2.Variant(
        reference_name=head.summary_counts.reference_name,
        reference_bases=head.summary_counts.ref_base,
        alternate_bases=[vcf_constants.GVCF_ALT_ALLELE],
        start=head.summary_counts.position,
        end=tail.summary_counts.position + 1,
        calls=[call])
def make_gvcfs(self, allele_count_summaries):
  """Yields gVCF reference-block Variants for the given allele counts.

  Each AlleleCountSummary is first paired with a raw GQ, a quantized GQ and
  genotype likelihoods. Runs of consecutive summaries with an identical
  quantized GQ are then collapsed into one gVCF record. The record's
  reference_name, reference_bases and start come from the first site of the
  run; its end is the last site's position plus one; its sole alternate allele
  is vcf_constants.GVCF_ALT_ALLELE. The record holds exactly one VariantCall
  whose call_set_name is options.sample_name, whose genotype is [0, 0]
  (diploid reference), whose genotype likelihoods are those of the first site,
  whose GQ is the minimum raw GQ over the run, and whose MIN_DP is the minimum
  total read count over the run.

  Runs whose reference base belongs to EXTENDED_IUPAC_CODES but not to
  CANONICAL_DNA_BASES have no GQ and are skipped.

  Args:
    allele_count_summaries: iterable of AlleleCountSummary protos in
      coordinate-sorted order, providing the read counts, reference position
      and reference base for each site.

  Yields:
    third_party.nucleus.protos.Variant protos containing gVCF records, in the
    order the runs occur in the input.

  Raises:
    ValueError: While iterating, if a site's reference base is in neither
      CANONICAL_DNA_BASES nor EXTENDED_IUPAC_CODES.
  """

  def with_gq_and_likelihoods(summary_counts):
    """Builds the _GVCF record for a single site.

    Args:
      summary_counts: A single AlleleCountSummary.

    Returns:
      A _GVCF with summary_counts, quantized_gq, raw_gq, likelihoods and
      read_depth. raw_gq and likelihoods come from self.reference_confidence;
      they and quantized_gq are None for a non-canonical IUPAC reference
      base.

    Raises:
      ValueError: The reference base is not a valid DNA or IUPAC base.
    """
    ref_base = summary_counts.ref_base
    if ref_base in CANONICAL_DNA_BASES:
      read_depth = summary_counts.total_read_count
      raw_gq, likelihoods = self.reference_confidence(
          summary_counts.ref_supporting_read_count, read_depth)
      quantized_gq = _quantize_gq(raw_gq, self.options.gq_resolution)
    elif ref_base in EXTENDED_IUPAC_CODES:
      # Ambiguous reference base, so GQ and likelihoods are not calculated.
      read_depth = summary_counts.total_read_count
      quantized_gq = None
      raw_gq = None
      likelihoods = None
    else:
      message = ('Invalid reference base={} found during gvcf '
                 'calculation'.format(summary_counts.ref_base))
      raise ValueError(message)

    return _GVCF(
        summary_counts=summary_counts,
        quantized_gq=quantized_gq,
        raw_gq=raw_gq,
        likelihoods=likelihoods,
        read_depth=read_depth)

  # Group adjacent single-bp records by quantized GQ. Records are compatible
  # for merging when they share the same non-None quantized GQ.
  by_quantized_gq = operator.attrgetter('quantized_gq')
  records = (with_gq_and_likelihoods(sc) for sc in allele_count_summaries)
  for gq_key, run in itertools.groupby(records, key=by_quantized_gq):
    if gq_key is not None:
      run = list(run)
      first_site = run[0].summary_counts
      last_site = run[-1].summary_counts

      call = variants_pb2.VariantCall(
          call_set_name=self.options.sample_name,
          genotype=[0, 0],
          genotype_likelihood=run[0].likelihoods)
      # The block reports its least confident GQ and its shallowest depth.
      variantcall_utils.set_gq(call, min(rec.raw_gq for rec in run))
      variantcall_utils.set_min_dp(call, min(rec.read_depth for rec in run))

      yield variants_pb2.Variant(
          reference_name=first_site.reference_name,
          reference_bases=first_site.ref_base,
          alternate_bases=[vcf_constants.GVCF_ALT_ALLELE],
          start=first_site.position,
          end=last_site.position + 1,
          calls=[call])
    # A None key means a non-canonical reference base was encountered; such
    # runs are skipped without producing a record.