示例#1
0
def merge_predictions(call_variants_outputs, qual_filter=None):
    """Merges the predictions from the multi-allelic calls.

    With a single call, that call's variant and genotype probabilities are
    returned unchanged. Otherwise the alt alleles chosen by
    `get_alt_alleles_to_remove` are pruned, every remaining genotype is scored
    with the minimum of its entries in the flattened probability dict, and the
    scores are divided by their sum.

    Args:
      call_variants_outputs: Non-empty sequence of call outputs. The first
        element's `variant` is used as the canonical variant.
      qual_filter: Passed through to `get_alt_alleles_to_remove`.

    Returns:
      A `(variant, probabilities)` pair. For multiple calls the variant is
      pruned and simplified and `probabilities` is a list with one entry per
      genotype, in the order given by
      `variant_utils.genotype_ordering_in_likelihoods`.

    Raises:
      ValueError: If `call_variants_outputs` is empty or does not pass
        `is_valid_call_variants_outputs`.
    """
    # See the logic described in the class PileupImageCreator pileup_image.py
    #
    # Because of the logic above, this function expects all cases above to have
    # genotype_predictions that we can combine from.
    if not call_variants_outputs:
        raise ValueError('Expected 1 or more call_variants_outputs.')

    if not is_valid_call_variants_outputs(call_variants_outputs):
        raise ValueError('`call_variants_outputs` did not pass sanity check.')

    first_call = call_variants_outputs[0]
    other_calls = call_variants_outputs[1:]
    canonical_variant = first_call.variant
    if not other_calls:
        # Nothing to merge: a single call already carries the final answer.
        return canonical_variant, first_call.genotype_probabilities

    alt_alleles_to_remove = get_alt_alleles_to_remove(call_variants_outputs,
                                                      qual_filter)
    # Must be built from the unpruned variant, before prune_alleles runs.
    flattened_probs_dict = convert_call_variants_outputs_to_probs_dict(
        canonical_variant, call_variants_outputs, alt_alleles_to_remove)

    canonical_variant = prune_alleles(canonical_variant, alt_alleles_to_remove)
    predictions = [
        min(flattened_probs_dict[(m, n)]) for _, _, m, n in
        variant_utils.genotype_ordering_in_likelihoods(canonical_variant)
    ]
    denominator = sum(predictions)
    if denominator == 0:
        # Every genotype scored zero, so normalizing would divide by zero.
        # Fall back to a uniform distribution over the genotypes instead.
        predictions = [1.0] * len(predictions)
        denominator = sum(predictions)
    # Note the simplify_alleles call *must* happen after the predictions
    # calculation above. flattened_probs_dict is indexed by alt allele, and
    # simplify can change those alleles so we cannot simplify until afterwards.
    canonical_variant = simplify_alleles(canonical_variant)
    return canonical_variant, [i / denominator for i in predictions]
def merge_predictions(call_variants_outputs, qual_filter=None):
  """Merges the predictions from the multi-allelic calls.

  A lone call is passed through as-is. For several calls, alt alleles selected
  by `get_alt_alleles_to_remove` are pruned from the first call's variant, each
  surviving genotype takes the minimum of its values in the flattened
  probability dict, and those minima are normalized by their sum.

  Args:
    call_variants_outputs: Non-empty sequence of call outputs; element 0
      supplies the canonical variant.
    qual_filter: Forwarded unchanged to `get_alt_alleles_to_remove`.

  Returns:
    Tuple of (variant, list of genotype probabilities). With a single call the
    probabilities are that call's `genotype_probabilities` object itself.

  Raises:
    ValueError: If `call_variants_outputs` is empty or is rejected by
      `is_valid_call_variants_outputs`.
  """
  # See the logic described in the class PileupImageCreator pileup_image.py
  #
  # Because of the logic above, this function expects all cases above to have
  # genotype_predictions that we can combine from.
  if not call_variants_outputs:
    raise ValueError('Expected 1 or more call_variants_outputs.')

  if not is_valid_call_variants_outputs(call_variants_outputs):
    raise ValueError('`call_variants_outputs` did not pass sanity check.')

  first_call, other_calls = call_variants_outputs[0], call_variants_outputs[1:]
  canonical_variant = first_call.variant
  if not other_calls:
    return canonical_variant, first_call.genotype_probabilities

  alt_alleles_to_remove = get_alt_alleles_to_remove(call_variants_outputs,
                                                    qual_filter)
  flattened_probs_dict = convert_call_variants_outputs_to_probs_dict(
      canonical_variant, call_variants_outputs, alt_alleles_to_remove)

  canonical_variant = prune_alleles(canonical_variant, alt_alleles_to_remove)
  predictions = [
      min(flattened_probs_dict[(m, n)]) for _, _, m, n in
      variant_utils.genotype_ordering_in_likelihoods(canonical_variant)
  ]
  if sum(predictions) == 0:
    # All minima are zero; dividing by their sum would raise
    # ZeroDivisionError, so treat every genotype as equally likely.
    predictions = [1.0] * len(predictions)
  denominator = sum(predictions)
  # Note the simplify_alleles call *must* happen after the predictions
  # calculation above. flattened_probs_dict is indexed by alt allele, and
  # simplify can change those alleles so we cannot simplify until afterwards.
  canonical_variant = simplify_alleles(canonical_variant)
  return canonical_variant, [i / denominator for i in predictions]
def merge_predictions(call_variants_outputs,
                      qual_filter=None,
                      multiallelic_model=None):
    """Merges the predictions from the multi-allelic calls.

    Args:
      call_variants_outputs: Non-empty sequence of call outputs; the first
        element's `variant` is the canonical variant.
      qual_filter: Forwarded to `get_alt_alleles_to_remove`.
      multiallelic_model: Optional callable. When given and the pruned variant
        has exactly two alternate bases, it is called on the output of
        `get_multiallelic_distributions` and its first result row is used as
        the probabilities.

    Returns:
      A `(variant, probabilities)` pair; the variant has been passed through
      `variant_utils.simplify_variant_alleles`.

    Raises:
      ValueError: If `call_variants_outputs` is empty or fails
        `is_valid_call_variants_outputs`.
    """
    # See the logic described in the class PileupImageCreator pileup_image.py
    #
    # Because of the logic above, this function expects all cases above to have
    # genotype_predictions that we can combine from.
    if not call_variants_outputs:
        raise ValueError('Expected 1 or more call_variants_outputs.')
    if not is_valid_call_variants_outputs(call_variants_outputs):
        raise ValueError('`call_variants_outputs` did not pass sanity check.')

    head_call = call_variants_outputs[0]
    merged_variant = head_call.variant
    if not call_variants_outputs[1:]:
        # Only one call: nothing to merge, just simplify the alleles.
        merged_variant = variant_utils.simplify_variant_alleles(merged_variant)
        return merged_variant, head_call.genotype_probabilities

    removable_alts = get_alt_alleles_to_remove(call_variants_outputs,
                                               qual_filter)

    # The probs dict is unused when the multiallelic model runs, but it is
    # still built here, from the variant as it is before pruning.
    probs_by_allele_pair = convert_call_variants_outputs_to_probs_dict(
        merged_variant, call_variants_outputs, removable_alts)

    merged_variant = prune_alleles(merged_variant, removable_alts)

    # The alternate model only applies to the two-alt case (3 CVOs, 6 possible
    # genotypes) and only when a model was actually supplied.
    alt_count = len(merged_variant.alternate_bases)
    if alt_count == 2 and multiallelic_model is not None:
        distributions = get_multiallelic_distributions(call_variants_outputs,
                                                       removable_alts)
        model_output = multiallelic_model(distributions)
        normalized_predictions = model_output.numpy().tolist()[0]
    else:
        raw_scores = []
        for _, _, m, n in variant_utils.genotype_ordering_in_likelihoods(
                merged_variant):
            raw_scores.append(min(probs_by_allele_pair[(m, n)]))
        if sum(raw_scores) == 0:
            # Avoid dividing by zero: fall back to equal weights.
            raw_scores = [1.0] * len(raw_scores)
        total = sum(raw_scores)
        normalized_predictions = [score / total for score in raw_scores]

    # Simplifying *must* come after the scores are computed: the probs dict is
    # keyed by alt allele and simplification can change those alleles.
    merged_variant = variant_utils.simplify_variant_alleles(merged_variant)
    return merged_variant, normalized_predictions
示例#4
0
def genotype_options_for_variants(variants, enumeration_type):
    """Returns, per variant, the list of genotype options to explore.

    Args:
      variants: Iterable of variants to produce genotype options for.
      enumeration_type: An `EnumerationType` member selecting the strategy.
        TRUTH expands each result of `_variant_genotypes(variants)` with
        `with_false_negative_genotypes`; CANDIDATES lists the first two fields
        of every entry from `variant_utils.genotype_ordering_in_likelihoods`.

    Returns:
      A list with one element per variant, in order.

    Raises:
      ValueError: If `enumeration_type` is neither TRUTH nor CANDIDATES.
    """
    if enumeration_type == EnumerationType.TRUTH:
        truth_options = []
        for genotype in _variant_genotypes(variants):
            truth_options.append(with_false_negative_genotypes(genotype))
        return truth_options

    if enumeration_type == EnumerationType.CANDIDATES:
        candidate_options = []
        for variant in variants:
            ordering = variant_utils.genotype_ordering_in_likelihoods(variant)
            candidate_options.append([(i, j) for i, j, _, _ in ordering])
        return candidate_options

    raise ValueError('Unexpected EnumerationType', enumeration_type)
示例#5
0
def genotype_options_for_variants(variants, enumeration_type):
  """Builds the genotype options for each variant in `variants`.

  Args:
    variants: Iterable of variants.
    enumeration_type: `EnumerationType.TRUTH` or `EnumerationType.CANDIDATES`.

  Returns:
    A list, in the order of `variants`. For TRUTH each element is the result of
    `with_false_negative_genotypes`; for CANDIDATES each element is a list of
    `(i, j)` tuples taken from
    `variant_utils.genotype_ordering_in_likelihoods`.

  Raises:
    ValueError: For any other `enumeration_type`.
  """

  def _candidate_pairs(variant):
    # Keep only the first two fields of each 4-field ordering entry.
    pairs = []
    for i, j, _, _ in variant_utils.genotype_ordering_in_likelihoods(variant):
      pairs.append((i, j))
    return pairs

  if enumeration_type == EnumerationType.TRUTH:
    genotypes = _variant_genotypes(variants)
    return [with_false_negative_genotypes(g) for g in genotypes]
  if enumeration_type == EnumerationType.CANDIDATES:
    return [_candidate_pairs(v) for v in variants]
  raise ValueError('Unexpected EnumerationType', enumeration_type)
def genotype_options_for_variants(variants, enumeration_type):
    """Returns a list of sets of possible genotypes for each variant in variants.

    This function takes a list of variants and enumeration_type and produces a
    list of possible genotypes for each variant in order.

    If enumeration_type is ONLY_HOM_REF, then we return a singleton set for
    each variant containing only the hom-ref genotype (0, 0). If
    enumeration_type is TRUTH, then each variant must have an associated
    genotype field values, say (A, B), and we return the set genotype as well
    as all possible false negative genotypes. In our example, this means we'd
    return {(A, B), (0, A), (0, B), (0, 0)} as we could miss either the A, the
    B, or both alleles. If the enumeration_type is CANDIDATES, we don't require
    the Variant protos to have existing genotype field values and instead
    enumerate all possible unphased genotypes for each variant given its
    alternative alleles of each variant. For example, if we have a Variant with
    alleles = 'A' and 'C', we would return the three possible diploid genotypes
    {(0, 0), (0, 1), (1, 1)}.

    Args:
      variants: List[nucleus.protos.Variant]. A list of Variant protos to
        provide genotype options for. Some enumeration types may require the
        protos to have existing genotypes in their calls[] subfield.
      enumeration_type: EnumerationType. The kind of genotypes we want to
        explore for each variant.

    Returns:
      A list of sets with the same length and "order" as variants. Each set
      contains one or more diploid genotype tuples [e.g., (0, 1)] that
      collectively represent the possible genotypes we need to explore. For
      ONLY_HOM_REF and CANDIDATES every set is a distinct object, so mutating
      one entry never affects another.

    Raises:
      ValueError: if enumeration_type isn't one of the valid options.
    """
    if enumeration_type == EnumerationType.TRUTH:
        return [
            with_false_negative_genotypes(x)
            for x in _variant_genotypes(variants)
        ]
    elif enumeration_type == EnumerationType.CANDIDATES:
        return [{
            (i, j)
            for i, j, _, _ in variant_utils.genotype_ordering_in_likelihoods(v)
        } for v in variants]
    elif enumeration_type == EnumerationType.ONLY_HOM_REF:
        # Build a fresh set per variant. `[{(0, 0)}] * n` would repeat one
        # shared set object, so mutating any entry would change them all.
        return [{(0, 0)} for _ in variants]
    else:
        raise ValueError('Unexpected EnumerationType', enumeration_type)
示例#7
0
 def test_genotype_ordering_in_likelihoods(self, variant, expected):
   """Checks the genotype ordering produced for `variant` equals `expected`."""
   # Materialize the result so it can be compared against `expected` directly.
   actual = list(variant_utils.genotype_ordering_in_likelihoods(variant))
   self.assertEqual(actual, expected)
示例#8
0
 def test_genotype_ordering_in_likelihoods(self, variant, expected):
   """Asserts `genotype_ordering_in_likelihoods(variant)` lists `expected`."""
   ordering = variant_utils.genotype_ordering_in_likelihoods(variant)
   observed = list(ordering)
   self.assertEqual(observed, expected)