def label_variants(variants, truth_variants, ref):
    """Labels candidate variants with the genotypes that best explain truth.

    Every genotype assignment of the candidates and of the truth variants is
    expanded into haplotypes. Assignments whose haplotypes agree are collected,
    and the best of them is used to label the candidates. See the module-level
    documentation for the full description of the algorithm.

    Args:
      variants: list[nucleus.protos.Variant]. Candidate variants in
        coordinate-sorted order, all on the same chromosome.
      truth_variants: list[nucleus.protos.Variant]. Truth variants in
        coordinate-sorted order, covering the same genomic interval as
        variants.
      ref: ReferenceRegion. Supplies the reference bases for the variants; it
        must cover at least the span of the variants.

    Returns:
      The result of variants_with_assigned_genotypes() on the best match, i.e.
      copies of variants with call[0].genotype set to the optimal labeling.
      Returns None when no candidate haplotypes equal any truth haplotypes.

    Raises:
      ValueError: If variants or truth_variants are not sorted.
    """
    # Materialize both inputs so they can be traversed more than once.
    variants = list(variants)
    truth_variants = list(truth_variants)

    if _DEBUG_PRINTING_IS_ENABLED:
        print_variants('candidates', variants)
        print_variants('truth', truth_variants)

    if not variant_utils.variants_are_sorted(variants):
        raise ValueError('Variants are not sorted', variants)
    if not variant_utils.variants_are_sorted(truth_variants):
        raise ValueError('truth_variants are not sorted', truth_variants)

    # Truth haplotypes are deduplicated; candidate haplotypes are used as-is.
    all_truth_haplotypes = enumerate_all_possible_haplotypes(
        truth_variants, ref, EnumerationType.TRUTH)
    truth_haplotypes = deduplicate_haplotypes(all_truth_haplotypes)
    variant_haplotypes = enumerate_all_possible_haplotypes(
        variants, ref, EnumerationType.CANDIDATES)

    # Keep every (candidate, truth) genotype pairing whose haplotypes agree.
    matches = [
        LabelerMatch(
            haplotypes=truth_haps,
            variants=variants,
            matched_variant_genotypes=candidate_genotypes,
            truth_variants=truth_variants,
            matched_truth_genotypes=truth_genotypes)
        for candidate_haps, candidate_genotypes in variant_haplotypes
        for truth_haps, truth_genotypes in truth_haplotypes
        if truth_haps == candidate_haps
    ]

    if not matches:
        return None
    return select_best_match(matches).variants_with_assigned_genotypes()
def label_variants(variants, truth_variants, ref):
    """Assigns to each candidate variant the genotype that best matches truth.

    Refer to the module-level documentation for an overview of the algorithm.

    Args:
      variants: list[nucleus.protos.Variant]. Coordinate-sorted candidate
        variants, all located on one chromosome.
      truth_variants: list[nucleus.protos.Variant]. Coordinate-sorted truth
        variants for the same genomic interval as variants.
      ref: ReferenceRegion. Source of reference bases; must span at least the
        region covered by the variants.

    Returns:
      New variants copied from variants, with call[0].genotype set according to
      the best match, as produced by variants_with_assigned_genotypes().
      None is returned if no pairing of candidate and truth haplotypes is equal.

    Raises:
      ValueError: If either input list is not sorted.
    """
    variants = list(variants)
    truth_variants = list(truth_variants)

    if _DEBUG_PRINTING_IS_ENABLED:
        print_variants('candidates', variants)
        print_variants('truth', truth_variants)

    # Validate ordering of the candidates first, then of the truth set.
    if not variant_utils.variants_are_sorted(variants):
        raise ValueError('Variants are not sorted', variants)
    if not variant_utils.variants_are_sorted(truth_variants):
        raise ValueError('truth_variants are not sorted', truth_variants)

    truth_haplotypes = deduplicate_haplotypes(
        enumerate_all_possible_haplotypes(
            truth_variants, ref, EnumerationType.TRUTH))
    variant_haplotypes = enumerate_all_possible_haplotypes(
        variants, ref, EnumerationType.CANDIDATES)

    # Compare every candidate haplotype set against every truth haplotype set
    # and record the genotype assignments of each agreeing pair.
    agreeing = []
    for cand_haplotypes, cand_genotypes in variant_haplotypes:
        for true_haplotypes, true_genotypes in truth_haplotypes:
            if true_haplotypes == cand_haplotypes:
                match = LabelerMatch(
                    haplotypes=true_haplotypes,
                    variants=variants,
                    matched_variant_genotypes=cand_genotypes,
                    truth_variants=truth_variants,
                    matched_truth_genotypes=true_genotypes)
                agreeing.append(match)

    return (select_best_match(agreeing).variants_with_assigned_genotypes()
            if agreeing else None)
def _raise_if_not_sorted_or_not_on_same_chromosome(variants):
    """Validates that variants is sorted and confined to one chromosome.

    Args:
      variants: An indexable, sliceable sequence of variants, each exposing a
        reference_name attribute.

    Raises:
      ValueError: If variants is not sorted, or if any variant after the first
        has a reference_name different from the first variant's.
    """
    if not variant_utils.variants_are_sorted(variants):
        raise ValueError('Variants must be sorted', variants)

    # Every later variant is compared against the first one; an empty or
    # single-element sequence never enters the loop.
    for other in variants[1:]:
        first = variants[0]
        if first.reference_name != other.reference_name:
            # The message reports the offending variant's chromosome first.
            message = (
                'Variants (v1={}, v2={}) not on the same chromosome'.format(
                    other.reference_name, first.reference_name))
            raise ValueError(message)
def test_sorted_variants(self, sorted_variants):
    """Checks sorted_variants() and variants_are_sorted() on all orderings.

    Args:
      sorted_variants: Variants already in sorted order; every permutation of
        them is exercised.
    """

    def _as_range_tuples(variant_seq):
        # Position-only view of the variants, used to compare orderings.
        return [variant_utils.variant_range_tuple(v) for v in variant_seq]

    expected_ranges = _as_range_tuples(sorted_variants)
    count = len(sorted_variants)

    for shuffled in itertools.permutations(sorted_variants, r=count):
        # Sorting any permutation must reproduce the sorted input.
        resorted = variant_utils.sorted_variants(shuffled)
        self.assertEqual(resorted, sorted_variants)

        # variants_are_sorted() should be true exactly when this permutation
        # has the same sequence of range tuples as the sorted input.
        reported_sorted = variant_utils.variants_are_sorted(shuffled)
        actually_sorted = _as_range_tuples(shuffled) == expected_ranges
        self.assertEqual(reported_sorted, actually_sorted)
def find_best_matching_haplotypes(candidates, truths, ref):
    """Finds the genotype assignment of candidates that best matches truths.

    See the module-level documentation for a general description of the
    algorithm.

    Args:
      candidates: list[nucleus.protos.Variant]. Candidate variants in
        coordinate-sorted order, all on the same chromosome.
      truths: list[nucleus.protos.Variant]. Truth variants in coordinate-sorted
        order, for the same genomic interval as candidates.
      ref: ReferenceRegion. Provides reference bases for the variants; must
        cover at least the span of the variants.

    Returns:
      The HaplotypeMatch chosen by select_best_haplotype_match() among all
      genotype assignments whose candidate and truth haplotypes are equal, or
      None if there is no such assignment.

    Raises:
      ValueError: If candidates or truths are not sorted.
    """
    candidates = list(candidates)
    truths = list(truths)

    if _DEBUG_PRINTING_IS_ENABLED:
        _log_variants('candidates', candidates)
        _log_variants('truth', truths)

    if not variant_utils.variants_are_sorted(candidates):
        raise ValueError('candidates are not sorted', candidates)
    if not variant_utils.variants_are_sorted(truths):
        raise ValueError('truths are not sorted', truths)

    # When the *other* list is empty, only a hom-ref enumeration is requested
    # (for speed): empty candidates restrict the truth enumeration, and empty
    # truths restrict the candidate enumeration.
    truth_enum = (
        EnumerationType.TRUTH if candidates else EnumerationType.ONLY_HOM_REF)
    truth_haplotypes = deduplicate_haplotypes(
        enumerate_all_possible_haplotypes(truths, ref, truth_enum))

    # Note, it may be worth deduplicating these haplotypes as well.
    candidate_enum = (
        EnumerationType.CANDIDATES if truths else EnumerationType.ONLY_HOM_REF)
    variant_haplotypes = enumerate_all_possible_haplotypes(
        candidates, ref, candidate_enum)

    # Collect every pairing of candidate and truth genotypes that yields
    # equal haplotypes.
    matches = [
        HaplotypeMatch(
            haplotypes=truth_haps,
            candidates=candidates,
            candidate_genotypes=candidate_gts,
            truths=truths,
            truth_genotypes=truth_gts)
        for candidate_haps, candidate_gts in variant_haplotypes
        for truth_haps, truth_gts in truth_haplotypes
        if truth_haps == candidate_haps
    ]

    if not matches:
        return None
    return select_best_haplotype_match(matches)