Exemplo n.º 1
0
def add_call_to_variant(variant, predictions, qual_filter=0, sample_name=None):
  """Populates the single call of `variant` from genotype probabilities.

  The most likely genotype is selected from `predictions`. The variant's only
  call receives that genotype, a GQ value, the sample name and one genotype
  likelihood per prediction; the variant itself receives its quality and its
  filter field.

  Args:
    variant: third_party.nucleus.protos.Variant protobuf that is updated in
      place.
    predictions: N element array-like. The real-space probabilities of each
      genotype state for this variant.
    qual_filter: float. Threshold forwarded to compute_filter_fields when the
      variant's filter field is computed.
    sample_name: str. Value assigned to the call's call_set_name field.

  Returns:
    The Variant record that was passed in, after it has been filled in.

  Raises:
    ValueError: If variant doesn't have exactly one variant.call record.
  """
  the_call = variant_utils.only_call(variant)

  # Allele count is the reference allele plus every alternate allele.
  best_index, best_genotype = most_likely_genotype(
      predictions, n_alleles=1 + len(variant.alternate_bases))

  genotype_quality, site_quality = compute_quals(predictions, best_index)
  variant.quality = site_quality

  the_call.call_set_name = sample_name
  variantcall_utils.set_gt(the_call, best_genotype)
  variantcall_utils.set_gq(the_call, genotype_quality)

  # Each probability is converted with perror_to_bounded_log10_perror before
  # being stored as a genotype likelihood.
  variantcall_utils.set_gl(the_call, [
      genomics_math.perror_to_bounded_log10_perror(probability)
      for probability in predictions
  ])

  # Filters are computed last, once variant.quality has been assigned.
  filter_values = compute_filter_fields(variant, qual_filter)
  variant.filter[:] = filter_values
  return variant
def add_call_to_variant(variant, predictions, qual_filter=0, sample_name=None):
    """Populates the single call of `variant` from genotype probabilities.

    The most likely genotype is selected from `predictions`. The variant's
    only call receives that genotype, a GQ value, the sample name and one
    genotype likelihood per prediction; the variant itself receives its
    quality and its filter field. As a final step the variant is handed to
    uncall_homref_gt_if_lowqual together with FLAGS.cnn_homref_call_min_gq.

    Args:
      variant: third_party.nucleus.protos.Variant protobuf that is updated in
        place.
      predictions: N element array-like. The real-space probabilities of each
        genotype state for this variant.
      qual_filter: float. Threshold forwarded to compute_filter_fields when
        the variant's filter field is computed.
      sample_name: str. Value assigned to the call's call_set_name field.

    Returns:
      The Variant record that was passed in, after it has been filled in.

    Raises:
      ValueError: If variant doesn't have exactly one variant.call record.
    """
    the_call = variant_utils.only_call(variant)

    # Allele count is the reference allele plus every alternate allele.
    best_index, best_genotype = most_likely_genotype(
        predictions, n_alleles=1 + len(variant.alternate_bases))

    genotype_quality, site_quality = compute_quals(predictions, best_index)
    variant.quality = site_quality

    the_call.call_set_name = sample_name
    variantcall_utils.set_gt(the_call, best_genotype)
    variantcall_utils.set_gq(the_call, genotype_quality)

    # Each probability is converted with perror_to_bounded_log10_perror before
    # being stored as a genotype likelihood.
    variantcall_utils.set_gl(the_call, [
        genomics_math.perror_to_bounded_log10_perror(probability)
        for probability in predictions
    ])

    # Filters are computed once variant.quality has been assigned.
    filter_values = compute_filter_fields(variant, qual_filter)
    variant.filter[:] = filter_values

    # Runs after the genotype, GQ and filters are all in place.
    uncall_homref_gt_if_lowqual(variant, FLAGS.cnn_homref_call_min_gq)
    return variant
Exemplo n.º 3
0
    def make_gvcfs(self, allele_count_summaries):
        """Generates gVCF reference-block Variant protos from allele counts.

        Every AlleleCountSummary is annotated with a raw GQ, a quantized GQ,
        genotype likelihoods and its read depth. Consecutive summaries that
        share the same quantized GQ are merged into one block, and one
        Variant is yielded per block with:

          * reference_name, reference_bases and start taken from the first
            summary of the block, and end set to the last summary's
            position + 1;
          * alternate_bases set to [vcf_constants.GVCF_ALT_ALLELE];
          * a single VariantCall whose call_set_name is
            self.options.sample_name, whose genotype is [0, 0], whose
            genotype_likelihood comes from the first summary of the block,
            whose GQ is the smallest raw GQ in the block and whose MIN_DP is
            the smallest read depth in the block.

        Summaries whose reference base is not in CANONICAL_DNA_BASES but is
        in EXTENDED_IUPAC_CODES get a quantized GQ of None; blocks with that
        key produce no output.

        Args:
          allele_count_summaries: iterable of AlleleCountSummary protos in
            coordinate-sorted order. Each proto supplies the reference and
            total read counts, the reference position and the reference
            base.

        Yields:
          third_party.nucleus.protos.Variant protos containing gVCF records,
          in the order in which the blocks are encountered.

        Raises:
          ValueError: A reference base is in neither CANONICAL_DNA_BASES nor
            EXTENDED_IUPAC_CODES.
        """

        def with_gq_and_likelihoods(summary_counts):
            """Wraps summary_counts in a _GVCF record with confidence values.

            Args:
              summary_counts: A single AlleleCountSummary.

            Returns:
              A _GVCF holding summary_counts, the quantized GQ, the raw GQ,
              the genotype likelihoods and the total read count. Raw GQ and
              likelihoods come from self.reference_confidence; all three
              GQ/likelihood fields are None when the reference base is only
              in EXTENDED_IUPAC_CODES.

            Raises:
              ValueError: The reference base is not a valid DNA or IUPAC
                base.
            """
            base = summary_counts.ref_base
            if base in CANONICAL_DNA_BASES:
                ref_reads = summary_counts.ref_supporting_read_count
                depth = summary_counts.total_read_count
                raw, gls = self.reference_confidence(ref_reads, depth)
                quantized = _quantize_gq(raw, self.options.gq_resolution)
            elif base in EXTENDED_IUPAC_CODES:
                # Ambiguous reference base: no confidence values are
                # computed, only the depth is recorded.
                quantized = raw = gls = None
                depth = summary_counts.total_read_count
            else:
                raise ValueError(
                    'Invalid reference base={} found during gvcf '
                    'calculation'.format(summary_counts.ref_base))
            return _GVCF(
                summary_counts=summary_counts,
                quantized_gq=quantized,
                raw_gq=raw,
                likelihoods=gls,
                read_depth=depth)

        # Runs of adjacent records with an identical quantized GQ collapse
        # into a single block; a run keyed by None marks ambiguous reference
        # bases and emits nothing.
        annotated = map(with_gq_and_likelihoods, allele_count_summaries)
        by_quantized_gq = operator.attrgetter('quantized_gq')
        for shared_gq, run in itertools.groupby(annotated,
                                                key=by_quantized_gq):
            if shared_gq is None:
                continue

            block = list(run)
            lowest_gq = min(record.raw_gq for record in block)
            lowest_depth = min(record.read_depth for record in block)
            head = block[0]
            tail = block[-1]

            ref_call = variants_pb2.VariantCall(
                call_set_name=self.options.sample_name,
                genotype=[0, 0],
                genotype_likelihood=head.likelihoods)
            variantcall_utils.set_gq(ref_call, lowest_gq)
            variantcall_utils.set_min_dp(ref_call, lowest_depth)

            head_counts = head.summary_counts
            yield variants_pb2.Variant(
                reference_name=head_counts.reference_name,
                reference_bases=head_counts.ref_base,
                alternate_bases=[vcf_constants.GVCF_ALT_ALLELE],
                start=head_counts.position,
                end=tail.summary_counts.position + 1,
                calls=[ref_call])
Exemplo n.º 4
0
  def make_gvcfs(self, allele_count_summaries):
    """Generates gVCF reference-block Variant protos from allele counts.

    Every AlleleCountSummary is annotated with a raw GQ, a quantized GQ,
    genotype likelihoods and its read depth. Consecutive summaries that share
    the same quantized GQ are merged into one block, and one Variant is
    yielded per block with:

      * reference_name, reference_bases and start taken from the first
        summary of the block, and end set to the last summary's position + 1;
      * alternate_bases set to [vcf_constants.GVCF_ALT_ALLELE];
      * a single VariantCall whose call_set_name is self.options.sample_name,
        whose genotype is [0, 0], whose genotype_likelihood comes from the
        first summary of the block, whose GQ is the smallest raw GQ in the
        block and whose MIN_DP is the smallest read depth in the block.

    Summaries whose reference base is not in CANONICAL_DNA_BASES but is in
    EXTENDED_IUPAC_CODES get a quantized GQ of None; blocks with that key
    produce no output.

    Args:
      allele_count_summaries: iterable of AlleleCountSummary protos in
        coordinate-sorted order. Each proto supplies the reference and total
        read counts, the reference position and the reference base.

    Yields:
      third_party.nucleus.protos.Variant protos containing gVCF records, in
      the order in which the blocks are encountered.

    Raises:
      ValueError: A reference base is in neither CANONICAL_DNA_BASES nor
        EXTENDED_IUPAC_CODES.
    """

    def with_gq_and_likelihoods(summary_counts):
      """Wraps summary_counts in a _GVCF record with confidence values.

      Args:
        summary_counts: A single AlleleCountSummary.

      Returns:
        A _GVCF holding summary_counts, the quantized GQ, the raw GQ, the
        genotype likelihoods and the total read count. Raw GQ and likelihoods
        come from self.reference_confidence; all three GQ/likelihood fields
        are None when the reference base is only in EXTENDED_IUPAC_CODES.

      Raises:
        ValueError: The reference base is not a valid DNA or IUPAC base.
      """
      base = summary_counts.ref_base
      if base in CANONICAL_DNA_BASES:
        ref_reads = summary_counts.ref_supporting_read_count
        depth = summary_counts.total_read_count
        raw, gls = self.reference_confidence(ref_reads, depth)
        quantized = _quantize_gq(raw, self.options.gq_resolution)
      elif base in EXTENDED_IUPAC_CODES:
        # Ambiguous reference base: no confidence values are computed, only
        # the depth is recorded.
        quantized = raw = gls = None
        depth = summary_counts.total_read_count
      else:
        raise ValueError('Invalid reference base={} found during gvcf '
                         'calculation'.format(summary_counts.ref_base))
      return _GVCF(
          summary_counts=summary_counts,
          quantized_gq=quantized,
          raw_gq=raw,
          likelihoods=gls,
          read_depth=depth)

    # Runs of adjacent records with an identical quantized GQ collapse into a
    # single block; a run keyed by None marks ambiguous reference bases and
    # emits nothing.
    annotated = map(with_gq_and_likelihoods, allele_count_summaries)
    by_quantized_gq = operator.attrgetter('quantized_gq')
    for shared_gq, run in itertools.groupby(annotated, key=by_quantized_gq):
      if shared_gq is None:
        continue

      block = list(run)
      lowest_gq = min(record.raw_gq for record in block)
      lowest_depth = min(record.read_depth for record in block)
      head = block[0]
      tail = block[-1]

      ref_call = variants_pb2.VariantCall(
          call_set_name=self.options.sample_name,
          genotype=[0, 0],
          genotype_likelihood=head.likelihoods)
      variantcall_utils.set_gq(ref_call, lowest_gq)
      variantcall_utils.set_min_dp(ref_call, lowest_depth)

      head_counts = head.summary_counts
      yield variants_pb2.Variant(
          reference_name=head_counts.reference_name,
          reference_bases=head_counts.ref_base,
          alternate_bases=[vcf_constants.GVCF_ALT_ALLELE],
          start=head_counts.position,
          end=tail.summary_counts.position + 1,
          calls=[ref_call])