def merge_predictions(call_variants_outputs,
                      qual_filter=None,
                      multiallelic_model=None):
    """Combines the per-call outputs for one site into a single prediction.

    Args:
      call_variants_outputs: Non-empty sequence of call outputs; each element
        is read for its `.variant` and `.genotype_probabilities`.
      qual_filter: Passed through unchanged to `get_alt_alleles_to_remove`.
      multiallelic_model: Optional callable. When given, and exactly two alt
        alleles remain after pruning, it is applied to the multiallelic
        distributions instead of the min-and-normalize combination.

    Returns:
      A `(variant, probabilities)` tuple. The variant has had its alleles
      simplified; the probabilities are either the single call's
      `genotype_probabilities` (one output) or a list of merged values.

    Raises:
      ValueError: If `call_variants_outputs` is empty or fails
        `is_valid_call_variants_outputs`.
    """
    # The combination below relies on every call carrying genotype
    # predictions; see the PileupImageCreator class in pileup_image.py.
    if not call_variants_outputs:
        raise ValueError('Expected 1 or more call_variants_outputs.')
    if not is_valid_call_variants_outputs(call_variants_outputs):
        raise ValueError('`call_variants_outputs` did not pass sanity check.')

    head_cvo = call_variants_outputs[0]
    trailing_cvos = call_variants_outputs[1:]
    merged_variant = head_cvo.variant

    # A lone output needs no merging: simplify and hand back its own probs.
    if not trailing_cvos:
        return (variant_utils.simplify_variant_alleles(merged_variant),
                head_cvo.genotype_probabilities)

    removable_alts = get_alt_alleles_to_remove(call_variants_outputs,
                                               qual_filter)

    # Built from the un-pruned variant, so it has to be computed before
    # prune_alleles runs. It goes unused when the multiallelic model is run.
    probs_by_genotype = convert_call_variants_outputs_to_probs_dict(
        merged_variant, call_variants_outputs, removable_alts)

    merged_variant = prune_alleles(merged_variant, removable_alts)
    alt_count = len(merged_variant.alternate_bases)

    if alt_count != 2 or multiallelic_model is None:
        # Default path: take the minimum probability recorded for each
        # genotype, then rescale so the values sum to one.
        raw_probs = []
        for _, _, m, n in variant_utils.genotype_ordering_in_likelihoods(
                merged_variant):
            raw_probs.append(min(probs_by_genotype[(m, n)]))
        if sum(raw_probs) == 0:
            # All-zero input: fall back to a uniform distribution.
            raw_probs = [1.0] * len(raw_probs)
        total = sum(raw_probs)
        merged_probs = [p / total for p in raw_probs]
    else:
        # Two alts and a model available: 3 CVOs, 6 possible genotypes.
        distributions = get_multiallelic_distributions(call_variants_outputs,
                                                       removable_alts)
        merged_probs = multiallelic_model(distributions).numpy().tolist()[0]

    # Simplification *must* come last: probs_by_genotype is keyed by alt
    # allele and simplifying can rewrite those alleles.
    return (variant_utils.simplify_variant_alleles(merged_variant),
            merged_probs)
示例#2
0
    def _find_matching_variant_in_reader(self, variant):
        """Returns a truth variant compatible with `variant`, or None.

        Truth variants are fetched for the position of `variant`, kept only
        if they start where `variant` starts, and simplified. Among those, a
        variant whose reference and alternate bases both equal those of
        `variant` is preferred (the last such one if there are several);
        otherwise the first candidate is returned.

        Args:
          variant: The variant to look up; read for `start`,
            `reference_bases` and `alternate_bases`.

        Returns:
          The selected simplified truth variant, or None when no truth
          variant starts at the same position.
        """
        query_region = variant_utils.variant_position(variant)
        candidates = []
        for truth_variant in self._get_truth_variants(query_region):
            if variant.start == truth_variant.start:
                candidates.append(
                    variant_utils.simplify_variant_alleles(truth_variant))

        if not candidates:
            return None

        exact = [
            candidate for candidate in candidates
            if (candidate.alternate_bases == variant.alternate_bases
                and candidate.reference_bases == variant.reference_bases)
        ]
        if exact:
            # With several exact matches the last one is used.
            return exact[-1]

        logging.info(
            'Multiple matches detected; no good match found. Fall back '
            'to first. variant: %s: matches: %s', variant, candidates)
        # The first candidate is an arbitrary fallback and likely not the
        # best choice; what to do here may depend on the use case.
        return candidates[0]
示例#3
0
 def test_simplify_variant_alleles(self, alleles, start, expected_alleles,
                                   expected_end):
     """Checks the alleles and span produced by simplify_variant_alleles.

     Args:
       alleles: Input alleles; the first is the reference, the rest are alts.
       start: Start position used to build the input variant.
       expected_alleles: Expected alleles after simplification, in the same
         reference-first layout as `alleles`.
       expected_end: Expected `end` of the simplified variant.
     """
     ref_bases = alleles[0]
     alt_bases = alleles[1:]
     input_variant = _create_variant_with_alleles(ref=ref_bases,
                                                  alts=alt_bases,
                                                  start=start)
     result = variant_utils.simplify_variant_alleles(input_variant)

     # Alleles must match the expectation, reference first.
     self.assertEqual(result.reference_bases, expected_alleles[0])
     self.assertEqual(result.alternate_bases, expected_alleles[1:])
     # The start is expected to be unchanged; only the end may differ.
     self.assertEqual(result.start, start)
     self.assertEqual(result.end, expected_end)
示例#4
0
    def _match(self, variant):
        """Looks up the truth variant that corresponds to `variant`.

        `variant` is simplified first (so that, e.g., 'GAAA->GAA' compares
        equal to 'GA->G') and then handed to
        `_find_matching_variant_in_reader`. How a match between `variant` and
        the truth variant is turned into a label (e.g. by comparing alleles)
        is left to the caller.

        When no truth variant is found but the site is considered confident
        (either no confident regions are configured, or the variant overlaps
        them), the result of `_make_synthetic_hom_ref` is returned in its
        place.

        Args:
          variant: Our candidate third_party.nucleus.protos.Variant variant.

        Returns:
          A tuple of (match_status, truth_variant). match_status is True when
          no confident regions are configured; otherwise it is whatever
          `variant_overlaps` reports for the simplified variant.
          truth_variant is the matched truth variant, the synthetic variant
          described above, or None if nothing matched and the site is not
          confident.
        """
        simplified = variant_utils.simplify_variant_alleles(variant)
        truth_variant = self._find_matching_variant_in_reader(simplified)

        if self._confident_regions is None:
            # No constraint configured: every site counts as confident.
            is_confident = True
        else:
            is_confident = self._confident_regions.variant_overlaps(
                simplified, empty_set_return_value=False)

        if truth_variant is None and is_confident:
            truth_variant = self._make_synthetic_hom_ref(simplified)
        return is_confident, truth_variant
示例#5
0
def match_candidate_and_cohort_haplotypes(candidate_haps,
                                          cohort_haps_and_freqs):
  """Match candidate haplotypes with cohort haplotypes and update frequency.

  First, we look for exact haplotype matches between candidate and cohorts.
  If there're any matches, the REF allele frequency associated with the matching
  ALT allele is updated as well.

  Second, if no matches are found, we try to find inexact matches, where only
  REF alleles are matched. The inexact matching step is only used to update REF
  allele frequency. If no exact and inexact matches are found, set REF allele
  frequency to 1.

  Args:
    candidate_haps: A list of haplotype objects from a candidate. May be empty,
      in which case an empty dict is returned.
    cohort_haps_and_freqs: A list of haplotype objects from cohorts.
    Haplotype objects are stored as dicts:
      {'haplotype': a haplotype (string),
       'alt': an alt allele (string),
       'variant': an Variant proto}

  Returns:
    A dict with candidate alt alleles (and the REF allele) as keys, and
    associated frequencies as values. Empty when `candidate_haps` is empty.
  """
  # Without candidates there is nothing to annotate. Returning early also
  # avoids indexing candidate_haps[0] below, which would raise IndexError.
  if not candidate_haps:
    return {}

  dict_allele_frequency = {}
  for candidate_obj in candidate_haps:
    candidate_haplotype = candidate_obj['haplotype']
    candidate_alt = candidate_obj['alt']
    candidate_variant = candidate_obj['variant']

    for cohort_obj in cohort_haps_and_freqs:
      cohort_haplotype = cohort_obj['haplotype']
      # Exact haplotype match.
      if candidate_haplotype == cohort_haplotype:
        cohort_variant = cohort_obj['variant']
        cohort_frequency = get_allele_frequency(
            cohort_variant,
            list(cohort_variant.alternate_bases).index(cohort_obj['alt']))
        dict_allele_frequency[candidate_alt] = cohort_frequency

        # Update REF frequency if it is missing (or still falsy, e.g. 0).
        if not dict_allele_frequency.get(candidate_variant.reference_bases):
          dict_allele_frequency[candidate_variant.reference_bases] = (
              get_ref_allele_frequency(cohort_variant))

    # For an unmatched alt allele, set the frequency to 0.
    if not dict_allele_frequency.get(candidate_alt):
      dict_allele_frequency[candidate_alt] = 0

  # Calculate REF allele frequency if no exact match was found.
  # It is possible a novel mutation happens at a site where there are other
  # cohort variants. In this case, we cannot simply set REF frequency to 1.
  if sum(dict_allele_frequency.values()) == 0:
    candidate = candidate_haps[0]['variant']
    # Left align variants.
    s_candidate = variant_utils.simplify_variant_alleles(candidate)
    for cohort_obj in cohort_haps_and_freqs:
      s_cohort_variant = variant_utils.simplify_variant_alleles(
          cohort_obj['variant'])
      # Try to find inexact matches to set REF allele frequency.
      # Inexact matches here mean only REF alleles match.
      if (s_candidate.start == s_cohort_variant.start and
          s_candidate.reference_bases == s_cohort_variant.reference_bases):
        dict_allele_frequency[s_candidate.reference_bases] = (
            get_ref_allele_frequency(s_cohort_variant))

    # If still no match, set REF allele frequency to 1.
    # NOTE(review): the inexact match above is stored under the *simplified*
    # REF, while this check looks up the *unsimplified* REF. If simplification
    # changes the REF string, both keys can end up in the result -- confirm
    # this is intended.
    if not dict_allele_frequency.get(candidate.reference_bases):
      dict_allele_frequency[candidate.reference_bases] = 1

  return dict_allele_frequency