def merge_predictions(call_variants_outputs, qual_filter=None):
    """Merges the predictions from the multi-allelic calls.

    See the logic described in the class PileupImageCreator in
    pileup_image.py. Because of that logic, this function expects every
    call to carry genotype predictions that can be combined.

    Args:
      call_variants_outputs: Non-empty sequence of call_variants outputs.
        The `variant` of the first element is used as the canonical variant.
      qual_filter: Forwarded unchanged to get_alt_alleles_to_remove.

    Returns:
      A `(variant, probabilities)` tuple. With a single input call, the
      call's own variant and `genotype_probabilities` are returned as-is.
      Otherwise the variant is pruned and simplified, and the probabilities
      are the per-genotype minimums normalized to sum to 1. If every
      minimum is 0, a uniform distribution is returned instead.

    Raises:
      ValueError: If `call_variants_outputs` is empty or fails
        is_valid_call_variants_outputs.
    """
    if not call_variants_outputs:
        raise ValueError('Expected 1 or more call_variants_outputs.')

    if not is_valid_call_variants_outputs(call_variants_outputs):
        raise ValueError('`call_variants_outputs` did not pass sanity check.')

    first_call, other_calls = (call_variants_outputs[0],
                               call_variants_outputs[1:])
    canonical_variant = first_call.variant
    if not other_calls:
        # Only one call: nothing to merge.
        return canonical_variant, first_call.genotype_probabilities

    alt_alleles_to_remove = get_alt_alleles_to_remove(call_variants_outputs,
                                                      qual_filter)
    flattened_probs_dict = convert_call_variants_outputs_to_probs_dict(
        canonical_variant, call_variants_outputs, alt_alleles_to_remove)

    canonical_variant = prune_alleles(canonical_variant, alt_alleles_to_remove)
    predictions = [
        min(flattened_probs_dict[(m, n)])
        for _, _, m, n in variant_utils.genotype_ordering_in_likelihoods(
            canonical_variant)
    ]
    denominator = sum(predictions)
    if denominator == 0:
        # Every genotype's minimum probability is zero, so normalizing would
        # divide by zero. Fall back to a uniform distribution instead.
        predictions = [1.0] * len(predictions)
        denominator = sum(predictions)

    # Note the simplify_alleles call *must* happen after the predictions
    # calculation above. flattened_probs_dict is indexed by alt allele, and
    # simplify can change those alleles so we cannot simplify until afterwards.
    canonical_variant = simplify_alleles(canonical_variant)
    return canonical_variant, [i / denominator for i in predictions]
def merge_predictions(call_variants_outputs, qual_filter=None):
    """Merges the predictions from the multi-allelic calls.

    See the logic described in the class PileupImageCreator in
    pileup_image.py. Because of that logic, this function expects every
    call to carry genotype predictions that can be combined.

    Args:
      call_variants_outputs: Non-empty sequence of call_variants outputs.
        The first element provides the canonical variant via `.variant`.
      qual_filter: Passed straight through to get_alt_alleles_to_remove.

    Returns:
      A `(variant, probabilities)` tuple. When only one call is given, its
      variant and `genotype_probabilities` are returned untouched. When
      several are given, the variant is pruned then simplified, and the
      probabilities are the normalized per-genotype minimums (or a uniform
      distribution when all of those minimums are 0).

    Raises:
      ValueError: If `call_variants_outputs` is empty or does not pass
        is_valid_call_variants_outputs.
    """
    if not call_variants_outputs:
        raise ValueError('Expected 1 or more call_variants_outputs.')

    if not is_valid_call_variants_outputs(call_variants_outputs):
        raise ValueError('`call_variants_outputs` did not pass sanity check.')

    first_call, other_calls = (call_variants_outputs[0],
                               call_variants_outputs[1:])
    canonical_variant = first_call.variant
    if not other_calls:
        # A single call needs no merging.
        return canonical_variant, first_call.genotype_probabilities

    alt_alleles_to_remove = get_alt_alleles_to_remove(call_variants_outputs,
                                                      qual_filter)
    flattened_probs_dict = convert_call_variants_outputs_to_probs_dict(
        canonical_variant, call_variants_outputs, alt_alleles_to_remove)

    canonical_variant = prune_alleles(canonical_variant, alt_alleles_to_remove)
    predictions = [
        min(flattened_probs_dict[(m, n)])
        for _, _, m, n in variant_utils.genotype_ordering_in_likelihoods(
            canonical_variant)
    ]
    if sum(predictions) == 0:
        # Guard against ZeroDivisionError below: with no probability mass on
        # any genotype, treat all genotypes as equally likely.
        predictions = [1.0] * len(predictions)
    denominator = sum(predictions)

    # Note the simplify_alleles call *must* happen after the predictions
    # calculation above. flattened_probs_dict is indexed by alt allele, and
    # simplify can change those alleles so we cannot simplify until afterwards.
    canonical_variant = simplify_alleles(canonical_variant)
    return canonical_variant, [i / denominator for i in predictions]
def merge_predictions(call_variants_outputs,
                      qual_filter=None,
                      multiallelic_model=None):
    """Merges the predictions from the multi-allelic calls.

    See the logic described in the class PileupImageCreator in
    pileup_image.py. Because of that logic, this function expects every
    call to carry genotype predictions that can be combined.

    Args:
      call_variants_outputs: Non-empty sequence of call_variants outputs.
        The first element provides the canonical variant via `.variant`.
      qual_filter: Forwarded unchanged to get_alt_alleles_to_remove.
      multiallelic_model: Optional callable. When it is not None and exactly
        two alternate alleles remain after pruning, it is called on the
        result of get_multiallelic_distributions and its output is read via
        `.numpy().tolist()[0]` instead of using the min-based merge.

    Returns:
      A `(variant, probabilities)` tuple, where the variant has been passed
      through variant_utils.simplify_variant_alleles.

    Raises:
      ValueError: If `call_variants_outputs` is empty or fails
        is_valid_call_variants_outputs.
    """
    if not call_variants_outputs:
        raise ValueError('Expected 1 or more call_variants_outputs.')
    if not is_valid_call_variants_outputs(call_variants_outputs):
        raise ValueError('`call_variants_outputs` did not pass sanity check.')

    head = call_variants_outputs[0]
    remaining = call_variants_outputs[1:]
    merged_variant = head.variant

    if not remaining:
        # Single call: only the alleles need simplifying.
        merged_variant = variant_utils.simplify_variant_alleles(merged_variant)
        return merged_variant, head.genotype_probabilities

    removable_alts = get_alt_alleles_to_remove(call_variants_outputs,
                                               qual_filter)

    # probs_by_genotype doesn't get used if we run the multiallelic model.
    probs_by_genotype = convert_call_variants_outputs_to_probs_dict(
        merged_variant, call_variants_outputs, removable_alts)
    merged_variant = prune_alleles(merged_variant, removable_alts)

    alt_count = len(merged_variant.alternate_bases)
    if alt_count != 2 or multiallelic_model is None:
        # Default path: take the smallest probability seen for each genotype
        # and renormalize.
        raw_scores = []
        for _, _, m, n in variant_utils.genotype_ordering_in_likelihoods(
                merged_variant):
            raw_scores.append(min(probs_by_genotype[(m, n)]))
        if sum(raw_scores) == 0:
            raw_scores = [1.0] * len(raw_scores)
        total = sum(raw_scores)
        merged_probs = [score / total for score in raw_scores]
    else:
        # Run alternate model for multiallelic cases.
        # We have 3 CVOs for 2 alts. In this case, there are 6 possible
        # genotypes.
        distributions = get_multiallelic_distributions(call_variants_outputs,
                                                       removable_alts)
        model_output = multiallelic_model(distributions)
        merged_probs = model_output.numpy().tolist()[0]

    # Note the simplify_variant_alleles call *must* happen after the
    # predictions calculation above. probs_by_genotype is indexed by alt
    # allele, and simplify can change those alleles so we cannot simplify
    # until afterwards.
    merged_variant = variant_utils.simplify_variant_alleles(merged_variant)
    return merged_variant, merged_probs
def genotype_options_for_variants(variants, enumeration_type):
    """Returns the genotype options to explore for each variant.

    Args:
      variants: The variants to produce genotype options for.
      enumeration_type: EnumerationType selecting how options are produced.
        TRUTH maps with_false_negative_genotypes over
        _variant_genotypes(variants). CANDIDATES builds, per variant, a list
        of (i, j) tuples taken from the first two fields of each entry of
        variant_utils.genotype_ordering_in_likelihoods.

    Returns:
      A list with one entry of genotype options per variant.

    Raises:
      ValueError: If enumeration_type is neither TRUTH nor CANDIDATES.
    """
    if enumeration_type == EnumerationType.TRUTH:
        options = []
        for genotype in _variant_genotypes(variants):
            options.append(with_false_negative_genotypes(genotype))
        return options

    if enumeration_type == EnumerationType.CANDIDATES:
        options = []
        for variant in variants:
            ordering = variant_utils.genotype_ordering_in_likelihoods(variant)
            options.append([(i, j) for i, j, _, _ in ordering])
        return options

    raise ValueError('Unexpected EnumerationType', enumeration_type)
def genotype_options_for_variants(variants, enumeration_type):
    """Returns the genotype options to explore for each variant.

    Args:
      variants: The variants to produce genotype options for.
      enumeration_type: EnumerationType selecting how options are produced.
        For TRUTH, each item of _variant_genotypes(variants) is passed
        through with_false_negative_genotypes. For CANDIDATES, each variant
        yields a list of (i, j) tuples built from the first two fields of
        every entry of variant_utils.genotype_ordering_in_likelihoods.

    Returns:
      A list with one entry of genotype options per variant.

    Raises:
      ValueError: If enumeration_type is neither TRUTH nor CANDIDATES.
    """

    def _candidate_pairs(variant):
        # Keep only the allele-index pair from each ordering entry.
        pairs = []
        for i, j, _, _ in variant_utils.genotype_ordering_in_likelihoods(
                variant):
            pairs.append((i, j))
        return pairs

    if enumeration_type == EnumerationType.TRUTH:
        truth_genotypes = _variant_genotypes(variants)
        return [with_false_negative_genotypes(gt) for gt in truth_genotypes]
    if enumeration_type == EnumerationType.CANDIDATES:
        return [_candidate_pairs(variant) for variant in variants]
    raise ValueError('Unexpected EnumerationType', enumeration_type)
def genotype_options_for_variants(variants, enumeration_type):
    """Returns a list of sets of possible genotypes for each variant in variants.

    This function takes a list of variants and enumeration_type and produces a
    list of possible genotypes for each variant in order.

    If enumeration_type is ONLY_HOM_REF, then we return a singleton set for
    each variant containing only the hom-ref genotype (0, 0). Every variant
    gets its own set object, so mutating one entry does not affect the others.

    If enumeration_type is TRUTH, then each variant must have associated
    genotype field values, say (A, B), and we return that genotype as well as
    all possible false negative genotypes. In our example, this means we'd
    return {(A, B), (0, A), (0, B), (0, 0)} as we could miss either the A, the
    B, or both alleles.

    If the enumeration_type is CANDIDATES, we don't require the Variant protos
    to have existing genotype field values and instead enumerate all possible
    unphased genotypes for each variant given its alternate alleles. For
    example, if we have a Variant with alleles = 'A' and 'C', we would return
    the three possible diploid genotypes {(0, 0), (0, 1), (1, 1)}.

    Args:
      variants: List[nucleus.protos.Variant]. A list of Variant protos to
        provide genotype options for. Some enumeration types may require the
        protos to have existing genotypes in their calls[] subfield.
      enumeration_type: EnumerationType. The kind of genotypes we want to
        explore for each variant.

    Returns:
      A list of sets with the same length and "order" as variants. Each set
      contains one or more diploid genotype tuples [e.g., (0, 1)] that
      collectively represent the possible genotypes we need to explore.

    Raises:
      ValueError: if enumeration_type isn't one of the valid options.
    """
    if enumeration_type == EnumerationType.TRUTH:
        return [
            with_false_negative_genotypes(x)
            for x in _variant_genotypes(variants)
        ]
    elif enumeration_type == EnumerationType.CANDIDATES:
        return [
            {
                (i, j)
                for i, j, _, _ in
                variant_utils.genotype_ordering_in_likelihoods(v)
            }
            for v in variants
        ]
    elif enumeration_type == EnumerationType.ONLY_HOM_REF:
        # Build a fresh set per variant. `[{(0, 0)}] * len(variants)` would
        # repeat one shared set object, so a mutation through any entry
        # would show up in all of them.
        return [{(0, 0)} for _ in variants]
    else:
        raise ValueError('Unexpected EnumerationType', enumeration_type)
def test_genotype_ordering_in_likelihoods(self, variant, expected):
    """Checks genotype_ordering_in_likelihoods(variant) lists as `expected`."""
    # Materialize the result so it can be compared against a plain list.
    actual = list(variant_utils.genotype_ordering_in_likelihoods(variant))
    self.assertEqual(actual, expected)
def test_genotype_ordering_in_likelihoods(self, variant, expected):
    """Asserts the genotype ordering produced for `variant` equals `expected`."""
    ordering = variant_utils.genotype_ordering_in_likelihoods(variant)
    # Convert to a list before comparing with the expected list.
    observed = list(ordering)
    self.assertEqual(observed, expected)