示例#1
0
def label_variants(variants, truth_variants, ref):
  """Labels candidate variants with the genotypes that best agree with truth.

  Haplotypes are enumerated for the truth variants (then deduplicated) and for
  the candidate variants. Every candidate/truth pair whose haplotypes compare
  equal is recorded as a LabelerMatch, and select_best_match() picks the
  winner. See the module-level documentation for general information on how
  this algorithm works.

  Args:
    variants: list[nucleus.protos.Variant]. A list of candidate variants, in
      coordinate-sorted order, all on the same chromosome.
    truth_variants: list[nucleus.protos.Variant]. A list of truth variants, in
      coordinate-sorted order, for the same interval on the genome as variants.
    ref: ReferenceRegion. Used to get reference bases for variants. Must cover
      at least the span of the variants.

  Returns:
    The value of variants_with_assigned_genotypes() on the best match (per the
    original documentation: a list of new variants, copied from variants, with
    call[0].genotype assigned), or None when no candidate haplotypes equal any
    truth haplotypes.

  Raises:
    ValueError: If variants or truth_variants fails the
      variant_utils.variants_are_sorted() check.
  """
  variants = list(variants)
  truth_variants = list(truth_variants)

  if _DEBUG_PRINTING_IS_ENABLED:
    for title, group in (('candidates', variants), ('truth', truth_variants)):
      print_variants(title, group)

  # Candidates are validated before truth, so an unsorted candidate list is
  # the error reported when both inputs are unsorted.
  sortedness_checks = (
      ('Variants are not sorted', variants),
      ('truth_variants are not sorted', truth_variants),
  )
  for message, group in sortedness_checks:
    if not variant_utils.variants_are_sorted(group):
      raise ValueError(message, group)

  truth_enumeration = enumerate_all_possible_haplotypes(
      truth_variants, ref, EnumerationType.TRUTH)
  truth_haplotypes = deduplicate_haplotypes(truth_enumeration)

  # Only the truth side is deduplicated; candidate haplotypes are used as
  # enumerated.
  candidate_haplotypes = enumerate_all_possible_haplotypes(
      variants, ref, EnumerationType.CANDIDATES)

  # Candidate-major, truth-minor ordering of the matches.
  matches = [
      LabelerMatch(
          haplotypes=truth_hap,
          variants=variants,
          matched_variant_genotypes=candidate_gt,
          truth_variants=truth_variants,
          matched_truth_genotypes=truth_gt)
      for candidate_hap, candidate_gt in candidate_haplotypes
      for truth_hap, truth_gt in truth_haplotypes
      if truth_hap == candidate_hap
  ]

  if matches:
    return select_best_match(matches).variants_with_assigned_genotypes()
  return None
示例#2
0
def label_variants(variants, truth_variants, ref):
    """Assigns to each candidate variant the genotype that best matches truth.

    The truth variants and the candidate variants are each expanded into
    their possible haplotypes (the truth side is deduplicated). Whenever a
    candidate haplotype equals a truth haplotype the pairing is kept as a
    LabelerMatch; the best of those, as chosen by select_best_match(),
    supplies the labels. See the module-level documentation for general
    information on how this algorithm works.

    Args:
      variants: list[nucleus.protos.Variant]. A list of candidate variants,
        in coordinate-sorted order, all on the same chromosome.
      truth_variants: list[nucleus.protos.Variant]. A list of truth variants,
        in coordinate-sorted order, for the same interval on the genome as
        variants.
      ref: ReferenceRegion. Used to get reference bases for variants. Must
        cover at least the span of the variants.

    Returns:
      Whatever variants_with_assigned_genotypes() returns for the best match
      (per the original documentation: a list of new variants, copied from
      variants, with call[0].genotype assigned), or None if no candidate
      haplotypes equal any truth haplotypes.

    Raises:
      ValueError: If variants or truth_variants fails the
        variant_utils.variants_are_sorted() check.
    """
    variants = list(variants)
    truth_variants = list(truth_variants)

    if _DEBUG_PRINTING_IS_ENABLED:
        for title, group in (('candidates', variants),
                             ('truth', truth_variants)):
            print_variants(title, group)

    # Candidates are checked first, then truth.
    for message, group in (('Variants are not sorted', variants),
                           ('truth_variants are not sorted', truth_variants)):
        if not variant_utils.variants_are_sorted(group):
            raise ValueError(message, group)

    all_truth = enumerate_all_possible_haplotypes(
        truth_variants, ref, EnumerationType.TRUTH)
    truth_haplotypes = deduplicate_haplotypes(all_truth)

    # The candidate enumeration is not deduplicated.
    candidate_haplotypes = enumerate_all_possible_haplotypes(
        variants, ref, EnumerationType.CANDIDATES)

    # Matches are collected candidate-major, truth-minor.
    matches = [
        LabelerMatch(haplotypes=truth_hap,
                     variants=variants,
                     matched_variant_genotypes=candidate_gt,
                     truth_variants=truth_variants,
                     matched_truth_genotypes=truth_gt)
        for candidate_hap, candidate_gt in candidate_haplotypes
        for truth_hap, truth_gt in truth_haplotypes
        if truth_hap == candidate_hap
    ]

    if not matches:
        return None
    winner = select_best_match(matches)
    return winner.variants_with_assigned_genotypes()
def _raise_if_not_sorted_or_not_on_same_chromosome(variants):
    """Validates that variants are sorted and share a single reference_name.

    Args:
      variants: A sliceable sequence of objects exposing `reference_name`.
        An empty or single-element sequence passes the chromosome check.

    Raises:
      ValueError: If variant_utils.variants_are_sorted(variants) is falsy, or
        if any later variant's reference_name differs from the first one's.
    """
    if not variant_utils.variants_are_sorted(variants):
        raise ValueError('Variants must be sorted', variants)

    # variants[0] is only touched inside the loop, so empty input never
    # triggers an IndexError.
    for later in variants[1:]:
        anchor_name = variants[0].reference_name
        on_other_chromosome = anchor_name != later.reference_name
        if on_other_chromosome:
            # In the message, v1 is the offending later variant's chromosome
            # and v2 is the first variant's chromosome.
            template = 'Variants (v1={}, v2={}) not on the same chromosome'
            raise ValueError(
                template.format(later.reference_name, anchor_name))
示例#4
0
  def test_sorted_variants(self, sorted_variants):
    """Checks sorted_variants() and variants_are_sorted() on all orderings.

    Args:
      sorted_variants: The expected sorted sequence of variants; every
        permutation of it is fed to the functions under test.
    """

    def _as_range_tuples(items):
      # Reduce each variant to its range tuple so orderings can be compared.
      return [variant_utils.variant_range_tuple(item) for item in items]

    size = len(sorted_variants)
    for shuffled in itertools.permutations(sorted_variants, r=size):
      # Sorting any permutation must reproduce the expected order.
      resorted = variant_utils.sorted_variants(shuffled)
      self.assertEqual(resorted, sorted_variants)

      # variants_are_sorted() must report True exactly when the permutation's
      # range tuples equal those of the expected sorted sequence.
      reported_sorted = variant_utils.variants_are_sorted(shuffled)
      ranges_agree = (
          _as_range_tuples(shuffled) == _as_range_tuples(sorted_variants))
      self.assertEqual(reported_sorted, ranges_agree)
示例#5
0
  def test_sorted_variants(self, sorted_variants):
    """Exercises variant sorting helpers against every input permutation.

    Args:
      sorted_variants: The variants in their expected sorted order; each
        permutation of this sequence is one test case.
    """

    def _ranges_of(variant_seq):
      # Project variants onto their range tuples for order comparison.
      return [variant_utils.variant_range_tuple(v) for v in variant_seq]

    every_ordering = itertools.permutations(
        sorted_variants, r=len(sorted_variants))
    for ordering in every_ordering:
      # sorted_variants() has to restore the expected order.
      self.assertEqual(
          variant_utils.sorted_variants(ordering), sorted_variants)

      # variants_are_sorted() should be True only when this ordering has the
      # same range tuples, position by position, as the sorted sequence.
      claimed = variant_utils.variants_are_sorted(ordering)
      actual = _ranges_of(ordering) == _ranges_of(sorted_variants)
      self.assertEqual(claimed, actual)
def find_best_matching_haplotypes(candidates, truths, ref):
    """Finds the genotype assignment of candidates that best matches truths.

    Haplotypes are enumerated for the truths (then deduplicated) and for the
    candidates. Each candidate/truth pair with equal haplotypes becomes a
    HaplotypeMatch, and select_best_haplotype_match() chooses among them.
    See the module-level documentation for general information on how this
    algorithm works.

    Args:
      candidates: list[nucleus.protos.Variant]. A list of candidate variants,
        in coordinate-sorted order, all on the same chromosome.
      truths: list[nucleus.protos.Variant]. A list of truth variants, in
        coordinate-sorted order, for the same interval on the genome as
        variants.
      ref: ReferenceRegion. Used to get reference bases for variants. Must
        cover at least the span of the variants.

    Returns:
      The HaplotypeMatch chosen by select_best_haplotype_match(), describing
      the best assignment of genotypes between candidates and truths, or None
      if no consistent assignment can be found.

    Raises:
      ValueError: If candidates or truths fails the
        variant_utils.variants_are_sorted() check.
    """
    candidates = list(candidates)
    truths = list(truths)

    if _DEBUG_PRINTING_IS_ENABLED:
        for title, group in (('candidates', candidates), ('truth', truths)):
            _log_variants(title, group)

    # Candidates are checked first, then truths.
    for message, group in (('candidates are not sorted', candidates),
                           ('truths are not sorted', truths)):
        if not variant_utils.variants_are_sorted(group):
            raise ValueError(message, group)

    # Each side's enumeration mode depends on whether the *other* side is
    # empty: with no variants on the opposite side, ONLY_HOM_REF is used for
    # speed.
    if candidates:
        truth_mode = EnumerationType.TRUTH
    else:
        truth_mode = EnumerationType.ONLY_HOM_REF
    truth_haplotypes = deduplicate_haplotypes(
        enumerate_all_possible_haplotypes(truths, ref, truth_mode))

    if truths:
        candidate_mode = EnumerationType.CANDIDATES
    else:
        candidate_mode = EnumerationType.ONLY_HOM_REF
    # Note, it may be worth deduplicating these haplotypes as well.
    candidate_haplotypes = enumerate_all_possible_haplotypes(
        candidates, ref, candidate_mode)

    # Matches are collected candidate-major, truth-minor.
    matches = [
        HaplotypeMatch(haplotypes=truth_hap,
                       candidates=candidates,
                       candidate_genotypes=candidate_gt,
                       truths=truths,
                       truth_genotypes=truth_gt)
        for candidate_hap, candidate_gt in candidate_haplotypes
        for truth_hap, truth_gt in truth_haplotypes
        if truth_hap == candidate_hap
    ]

    return select_best_haplotype_match(matches) if matches else None