def test_vcf_caller_end2end_outputs(self):
    """Checks golden training examples against the proposed (truth) VCF.

    The variants recovered from GOLDEN_VCF_CALLER_TRAINING_EXAMPLES must line
    up, in order, with the records of TRUTH_VARIANTS_VCF: first by variant
    key, then by the alleles of their genotypes.
    """
    example_variants = list(
        labeled_examples_to_vcf.examples_to_variants(
            testdata.GOLDEN_VCF_CALLER_TRAINING_EXAMPLES))

    # Pass 1: the variant keys (e.g. chr20:10099832:A->G) must be identical.
    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as truth_reader:
      keys_from_examples = [
          variant_utils.variant_key(example_variant)
          for example_variant in example_variants
      ]
      keys_from_vcf = [
          variant_utils.variant_key(vcf_variant)
          for vcf_variant in truth_reader.iterate()
      ]
      self.assertEqual(keys_from_examples, keys_from_vcf)

    # Pass 2: a fresh reader is opened to walk the VCF again; the VCF records
    # are unphased before their genotype alleles are compared.
    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as truth_reader:
      alleles_from_examples = [
          variant_utils.genotype_as_alleles(example_variant)
          for example_variant in example_variants
      ]
      alleles_from_vcf = [
          variant_utils.genotype_as_alleles(
              variant_utils.unphase_all_genotypes(vcf_variant))
          for vcf_variant in truth_reader.iterate()
      ]
      self.assertEqual(alleles_from_examples, alleles_from_vcf)
Пример #2
0
 def test_genotype_as_alleles_raises_with_bad_inputs(self):
   """Each malformed call below must raise some Exception."""
   # Every entry is deferred behind a lambda so that both the argument
   # construction and the call itself run inside the assertRaises context.
   failing_calls = (
       # No variant at all.
       lambda: variant_utils.genotype_as_alleles(None),
       # Variant built with gt=None.
       lambda: variant_utils.genotype_as_alleles(
           test_utils.make_variant(gt=None)),
       # call_ix=1 on a variant built with a single gt list.
       lambda: variant_utils.genotype_as_alleles(
           test_utils.make_variant(alleles=['A', 'C'], gt=[0, 0]), call_ix=1),
       # genotype_type is also expected to reject a missing variant.
       lambda: variant_utils.genotype_type(None),
   )
   for failing_call in failing_calls:
     with self.assertRaises(Exception):
       failing_call()
Пример #3
0
 def test_genotype_as_alleles_raises_with_bad_inputs(self):
   """Verifies that bad inputs make the genotype helpers raise."""

   def expect_failure(thunk):
     # Runs the deferred call and requires that it raises an Exception.
     with self.assertRaises(Exception):
       thunk()

   # A missing variant.
   expect_failure(lambda: variant_utils.genotype_as_alleles(None))
   # A variant created with gt=None.
   expect_failure(lambda: variant_utils.genotype_as_alleles(
       test_utils.make_variant(gt=None)))
   # Requesting call_ix=1 from a variant created with one gt list.
   expect_failure(lambda: variant_utils.genotype_as_alleles(
       test_utils.make_variant(alleles=['A', 'C'], gt=[0, 0]), call_ix=1))
   # genotype_type with a missing variant.
   expect_failure(lambda: variant_utils.genotype_type(None))
Пример #4
0
 def test_genotype_as_alleles(self, variant, expected):
   """Checks that genotype_as_alleles(variant) equals `expected`."""
   actual_alleles = variant_utils.genotype_as_alleles(variant)
   self.assertEqual(actual_alleles, expected)
Пример #5
0
 def test_genotype_as_alleles(self, variant, expected):
   """Asserts the alleles reported for `variant` match `expected`."""
   self.assertEqual(
       first=variant_utils.genotype_as_alleles(variant), second=expected)
Пример #6
0
def _genotype_from_matched_truth(candidate_variant, truth_variant):
    """Derives candidate_variant's diploid genotype from a matched truth variant.

    Each allele in the genotype of truth_variant is translated into an allele
    index of candidate_variant. For instance, with candidate A/C and truth A/C
    genotyped 0/1, the result is (0, 1): one copy of A and one copy of C. A
    truth genotype of 1/1 gives (1, 1).

    The two variants may list different numbers of alternate alleles and may
    spell the same allele differently as a consequence. Alleles are compared
    after simplification against their own reference bases, so a candidate of

        AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

    and a truth variant of

        A/AGT => just the simplified 2 bp insertion

    are reconciled: candidate AGT/AGTGT is equated with truth A/AGT, and the
    number of copies of AGT in truth becomes the number of copies of AGTGT in
    the returned genotype. A truth allele with no counterpart among the
    candidate's alternates is reported as index 0.

    Args:
      candidate_variant: Our candidate third_party.nucleus.protos.Variant.
      truth_variant: Our third_party.nucleus.protos.Variant truth variant
        containing true alleles and genotypes.

    Returns:
      A sorted tuple of allele indices with the same semantics as the genotype
      field of the VariantCall proto; (0, 0) when candidate_variant is a
      reference call.

    Raises:
      ValueError: If candidate_variant is None, truth_variant is None, or
        truth_variant doesn't have genotypes.
    """
    if candidate_variant is None:
        raise ValueError('candidate_variant cannot be None')
    if truth_variant is None:
        raise ValueError('truth_variant cannot be None')
    truth_call = variant_utils.only_call(truth_variant)
    if not variantcall_utils.has_genotypes(truth_call):
        raise ValueError(
            'truth_variant needs genotypes to be used for labeling',
            truth_variant)

    # A reference-call candidate always gets the (0, 0) genotype.
    if variant_utils.is_ref(candidate_variant):
        return (0, 0)

    def _candidate_index_for(truth_allele):
        # The truth reference allele always maps to index 0.
        if truth_allele == truth_variant.reference_bases:
            return 0
        wanted = variant_utils.simplify_alleles(
            truth_variant.reference_bases, truth_allele)
        # Lazily scan the candidate alts (1-based indices); only the first
        # match is consumed, so later alts are never simplified.
        matching_indices = (
            allele_index
            for allele_index, candidate_alt in enumerate(
                candidate_variant.alternate_bases, start=1)
            if wanted == variant_utils.simplify_alleles(
                candidate_variant.reference_bases, candidate_alt))
        # No candidate alt corresponds to this truth allele => index 0.
        return next(matching_indices, 0)

    allele_indices = [
        _candidate_index_for(truth_allele)
        for truth_allele in variant_utils.genotype_as_alleles(truth_variant)
    ]
    allele_indices.sort()
    return tuple(allele_indices)
Пример #7
0
def _genotype_from_matched_truth(candidate_variant, truth_variant):
  """Computes the diploid genotype of candidate_variant from truth_variant.

  The genotype assigned to truth_variant is re-expressed in terms of the
  alleles of candidate_variant. If candidate is A/C and truth is A/C with a
  0/1 genotype, the result is (0, 1), i.e. one copy of A and one copy of C.
  With a true genotype of 1/1 the result is (1, 1).

  candidate_variant and truth_variant are allowed to differ in how many
  alternate alleles they carry, and therefore in how the same allele is
  written. For example, candidate could be:

      AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

  while truth has:

      A/AGT => just the simplified 2 bp insertion

  Alleles are simplified against their own reference bases before being
  compared, so AGT/AGTGT in candidate is equated with A/AGT in truth, and the
  number of copies of AGT in truth determines the number of copies of AGTGT in
  the returned genotype. Truth alleles that match none of the candidate's
  alternates are counted as index 0.

  Args:
    candidate_variant: Our candidate third_party.nucleus.protos.Variant variant.
    truth_variant: Our third_party.nucleus.protos.Variant truth variant
      containing true alleles and genotypes.

  Returns:
    A sorted tuple of allele indices with the same semantics as the genotype
    field of the VariantCall proto; (0, 0) if candidate_variant is a reference
    call.

  Raises:
    ValueError: If candidate_variant is None, truth_variant is None, or
      truth_variant doesn't have genotypes.
  """
  if candidate_variant is None:
    raise ValueError('candidate_variant cannot be None')
  if truth_variant is None:
    raise ValueError('truth_variant cannot be None')
  truth_is_genotyped = variantcall_utils.has_genotypes(
      variant_utils.only_call(truth_variant))
  if not truth_is_genotyped:
    raise ValueError('truth_variant needs genotypes to be used for labeling',
                     truth_variant)

  # Reference-call candidates are always labeled (0, 0).
  if variant_utils.is_ref(candidate_variant):
    return (0, 0)

  matched_indices = []
  for truth_allele in variant_utils.genotype_as_alleles(truth_variant):
    # Default of 0 covers both the truth reference allele and truth alts that
    # have no counterpart among the candidate's alternates.
    matched_index = 0
    if truth_allele != truth_variant.reference_bases:
      simplified_truth = variant_utils.simplify_alleles(
          truth_variant.reference_bases, truth_allele)
      for alt_position, candidate_alt in enumerate(
          candidate_variant.alternate_bases):
        simplified_candidate = variant_utils.simplify_alleles(
            candidate_variant.reference_bases, candidate_alt)
        if simplified_truth == simplified_candidate:
          # Genotype indices are 1-based for alternates (0 is the reference).
          matched_index = alt_position + 1
          break
    matched_indices.append(matched_index)

  return tuple(sorted(matched_indices))