예제 #1
0
def examples_to_variants(examples_path, max_records=None):
  """Yields one labeled Variant proto per distinct variant range.

  Reads the tf.Examples stored at examples_path (which may be a sharded spec),
  extracts the variant carried by each example, and sorts those variants with
  variant_utils.variant_range_tuple as the key. When several examples share
  the same range (e.g. versions representing different alt_alleles), only the
  first variant of that group is used as the representative.

  Args:
    examples_path: str. Path, or sharded spec, to labeled tf.Examples produced
      by DeepVariant in training mode.
    max_records: int or None. Maximum number of records to read, or None, to
      read all of the records.

  Yields:
    nucleus.protos.Variant protos in coordinate-sorted order.

  Raises:
    ValueError: if the representative Variant of a group has no genotypes.
  """
  records = io_utils.read_tfrecords(examples_path, max_records=max_records)
  range_key = variant_utils.variant_range_tuple
  ordered = [tf_utils.example_variant(record) for record in records]
  ordered.sort(key=range_key)

  for _, same_range in itertools.groupby(ordered, range_key):
    # Groups produced by groupby are never empty, so next() always succeeds.
    representative = next(same_range)
    call = variant_utils.only_call(representative)
    if variantcall_utils.has_genotypes(call):
      yield representative
      continue
    message = (
        'Variant {} does not have any genotypes. This tool only works with '
        'variants that have been labeled.')
    raise ValueError(message.format(variant_utils.variant_key(representative)))
예제 #2
0
def examples_to_variants(examples_path, max_records=None):
    """Yields a representative Variant proto for each variant range.

    The tf.Examples found at examples_path (a plain path or a sharded spec)
    are read, the variant of every example is extracted, and the variants are
    sorted using variant_utils.variant_range_tuple. Variants sharing a range
    (for example, versions representing different alt_alleles) are collapsed
    to the first one of the group.

    Args:
      examples_path: str. Path, or sharded spec, to labeled tf.Examples
        produced by DeepVariant in training mode.
      max_records: int or None. Maximum number of records to read, or None,
        to read all of the records.

    Yields:
      nucleus.protos.Variant protos in coordinate-sorted order.

    Raises:
      ValueError: if the representative Variant of a group has no genotypes.
    """
    raw_examples = io_utils.read_tfrecords(
        examples_path, max_records=max_records)
    by_range = variant_utils.variant_range_tuple
    all_variants = sorted(
        [tf_utils.example_variant(raw) for raw in raw_examples],
        key=by_range)

    for _, candidates in itertools.groupby(all_variants, by_range):
        # Only the first variant of each range group is checked and emitted.
        chosen = next(candidates)
        labeled = variantcall_utils.has_genotypes(
            variant_utils.only_call(chosen))
        if not labeled:
            template = (
                'Variant {} does not have any genotypes. This tool only works with '
                'variants that have been labeled.')
            raise ValueError(
                template.format(variant_utils.variant_key(chosen)))
        yield chosen
  def test_vcf_caller_end2end_outputs(self):
    """Checks converted training examples against the proposed VCF.

    The variants recovered from GOLDEN_VCF_CALLER_TRAINING_EXAMPLES must have
    the same keys, and the same genotype alleles once the proposed genotypes
    are unphased, as the variants read from TRUTH_VARIANTS_VCF.
    """
    converted = list(
        labeled_examples_to_vcf.examples_to_variants(
            testdata.GOLDEN_VCF_CALLER_TRAINING_EXAMPLES))

    # First pass: the keys (like chr20:10099832:A->G) must be the same.
    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as truth_reader:
      converted_keys = [variant_utils.variant_key(v) for v in converted]
      proposed_keys = [
          variant_utils.variant_key(v) for v in truth_reader.iterate()
      ]
      self.assertEqual(converted_keys, proposed_keys)

    # Second pass: a fresh reader is opened to compare the genotype alleles.
    with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as truth_reader:
      converted_genotypes = [
          variant_utils.genotype_as_alleles(v) for v in converted
      ]
      proposed_genotypes = [
          variant_utils.genotype_as_alleles(
              variant_utils.unphase_all_genotypes(v))
          for v in truth_reader.iterate()
      ]
      self.assertEqual(converted_genotypes, proposed_genotypes)
def main(argv):
  """Converts the labeled examples named by FLAGS.examples into a VCF file.

  Contigs for the VCF header come from the reference given by FLAGS.ref. The
  sample name is FLAGS.sample_name when set; otherwise it is obtained from
  peek_sample_name on the variant stream. Every variant has the call-set name
  of its first call overwritten with that sample name before being written to
  FLAGS.output_vcf.

  Args:
    argv: Command-line arguments; unused.
  """
  del argv

  contigs = fasta.RefFastaReader(FLAGS.ref).header.contigs

  # A negative --max_records means "no limit", which is passed on as None.
  if FLAGS.max_records >= 0:
    max_records = FLAGS.max_records
  else:
    max_records = None
  variants_iter = examples_to_variants(FLAGS.examples, max_records=max_records)

  if FLAGS.sample_name:
    sample_name = FLAGS.sample_name
  else:
    # peek_sample_name also hands back the iterator to keep consuming.
    sample_name, variants_iter = peek_sample_name(variants_iter)

  header = dv_vcf_constants.deepvariant_header(
      contigs=contigs, sample_names=[sample_name])

  with vcf.VcfWriter(FLAGS.output_vcf, header=header) as writer:
    for variant in variants_iter:
      first_call = variant.calls[0]
      first_call.call_set_name = sample_name
      logging.log_every_n(logging.INFO, 'Converted %s', FLAGS.log_every,
                          variant_utils.variant_key(variant))
      writer.write(variant)
예제 #5
0
 def test_variant_key(self, variant, expected_key, sort_alleles=True):
   """Asserts that variant_utils.variant_key(variant) equals expected_key.

   Args:
     variant: The variant passed to variant_utils.variant_key.
     expected_key: The key the call is expected to return.
     sort_alleles: Forwarded to variant_key as its sort_alleles argument.
   """
   actual_key = variant_utils.variant_key(variant, sort_alleles=sort_alleles)
   self.assertEqual(actual_key, expected_key)
예제 #6
0
 def test_variant_key(self, variant, expected_key, sort_alleles=True):
   """Checks the key computed for `variant` against `expected_key`.

   Args:
     variant: The variant handed to variant_utils.variant_key.
     expected_key: The value that variant_key should produce.
     sort_alleles: Passed through as variant_key's sort_alleles keyword.
   """
   computed = variant_utils.variant_key(variant, sort_alleles=sort_alleles)
   self.assertEqual(computed, expected_key)
예제 #7
0
def print_variants(name, variants):
    """Logs a summary line for `variants`, then one line per variant.

    Args:
      name: Label written in the summary line.
      variants: Sized iterable of variants; its length is logged, and each
        element is logged with its key and genotype.
    """
    logging.info('variants: %s [%d]', name, len(variants))
    for variant in variants:
        key = variant_utils.variant_key(variant)
        # _variant_genotypes takes a list, so wrap the single variant and
        # take the only result.
        genotype = _variant_genotypes([variant])[0]
        logging.info('  %s gt=%s', key, genotype)
def _log_variants(name, variants):
    """Writes basic information about variants to logging.info.

    Args:
      name: Label written in the leading summary line.
      variants: Sized iterable of variants; the count is logged first, then
        each variant's key and genotype.
    """
    logging.info('variants: %s [%d]', name, len(variants))
    for item in variants:
        item_key = variant_utils.variant_key(item)
        # Wrap the single variant in a list and take the only result.
        item_genotype = _variant_genotypes([item])[0]
        logging.info('  %s gt=%s', item_key, item_genotype)
예제 #9
0
def print_variants(name, variants):
  """Logs the number of `variants` under `name`, then each variant.

  Args:
    name: Label written in the summary line.
    variants: Sized iterable of variants; each one is logged with its key and
      genotype.
  """
  logging.info('variants: %s [%d]', name, len(variants))
  for variant in variants:
    key = variant_utils.variant_key(variant)
    # Wrap the single variant in a list and take the only result.
    genotype = _variant_genotypes([variant])[0]
    logging.info('  %s gt=%s', key, genotype)