Пример #1
0
  def test_round_trip_vcf(self, test_datum_name):
    """Checks that variants are unchanged by a write-then-read cycle.

    The variants read from the test file are written out with VcfWriter and
    read back with a second VcfReader; both lists must compare equal.

    Args:
      test_datum_name: Name of the test data file, passed to
        test_utils.genomics_core_testdata.
    """
    source_path = test_utils.genomics_core_testdata(test_datum_name)
    roundtrip_path = test_utils.test_tmpfile('output_' + test_datum_name)

    # Step 1: load the original variants.
    original_reader = vcf.VcfReader(source_path)
    original_variants = list(original_reader.iterate())
    self.assertTrue(original_variants, 'Reader failed to find records')

    # Step 2: write them out using a private copy of the input header.
    header_copy = copy.deepcopy(original_reader.header)
    options = variants_pb2.VcfWriterOptions()
    with vcf_writer.VcfWriter.to_file(roundtrip_path, header_copy,
                                      options) as writer:
      for variant in original_variants:
        writer.write(variant)

    # Steps 3 and 4: read the written file back and compare.
    roundtrip_reader = vcf.VcfReader(roundtrip_path)
    roundtrip_variants = list(roundtrip_reader.iterate())
    self.assertEqual(original_variants, roundtrip_variants,
                     'Round-tripped variants not as expected')
Пример #2
0
    def setUp(self):
        """Opens the sites VCF without an index and the samples VCF with one."""
        sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
        self.sites_reader = vcf.VcfReader(sites_path, use_index=False)

        samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
        self.samples_reader = vcf.VcfReader(samples_path, use_index=True)
Пример #3
0
    def test_vcf_query(self):
        """Checks that a range query returns the same variants for both files.

        An index is built for self.output_file first, then the same region is
        queried on the input and the output readers.
        """
        tabix.build_index(self.output_file)
        self.input_reader = vcf.VcfReader(self.input_file)
        self.output_reader = vcf.VcfReader(self.output_file)

        region = ranges.parse_literal('chr3:100,000-500,000')
        from_input = list(self.input_reader.query(region))
        from_output = list(self.output_reader.query(region))
        self.assertEqual(from_input, from_output)
Пример #4
0
  def test_headerless_vcf(self):
    """Round-trips variants through a VCF written with exclude_header=True."""
    source_path = test_utils.genomics_core_testdata('test_sites.vcf')
    headerless_path = test_utils.test_tmpfile('output.vcf')
    written_variants = []
    with vcf.VcfReader(source_path) as reader:
      with vcf.VcfWriter(
          headerless_path, header=reader.header,
          exclude_header=True) as writer:
        for variant in reader:
          written_variants.append(variant)
          writer.write(variant)

      # The file was written without its header, so the source header is
      # handed to the second reader explicitly.
      with vcf.VcfReader(
          headerless_path, header=reader.header) as headerless_reader:
        self.assertEqual(written_variants, list(headerless_reader))
Пример #5
0
    def test_roundtrip(self,
                       expected_infos,
                       expected_fmt,
                       expected_fmt1,
                       expected_fmt2,
                       reader_excluded_info=None,
                       reader_excluded_format=None,
                       writer_excluded_info=None,
                       writer_excluded_format=None):
        """Reads a VCF, writes it back out, and compares the raw output text.

        The expected text is self.header followed by each entry of
        self.record_format_strings filled in with the expected INFO and
        FORMAT values.

        Args:
          expected_infos: Per-record values substituted for `info`.
          expected_fmt: Value substituted for `fmt` in every record.
          expected_fmt1: Per-record values substituted for `efmts1`.
          expected_fmt2: Per-record values substituted for `efmts2`.
          reader_excluded_info: Passed to VcfReader as excluded_info_fields.
          reader_excluded_format: Passed to VcfReader as
            excluded_format_fields.
          writer_excluded_info: Passed to VcfWriter as excluded_info_fields.
          writer_excluded_format: Passed to VcfWriter as
            excluded_format_fields.
        """
        rendered_records = []
        per_record_values = zip(self.record_format_strings, expected_infos,
                                expected_fmt1, expected_fmt2)
        for template, info, fmts1, fmts2 in per_record_values:
            rendered_records.append(
                template.format(
                    info=info, fmt=expected_fmt, efmts1=fmts1, efmts2=fmts2))
        expected = self.header + ''.join(rendered_records)

        input_path = test_utils.genomics_core_testdata('test_py_roundtrip.vcf')
        with vcf.VcfReader(
                input_path,
                excluded_info_fields=reader_excluded_info,
                excluded_format_fields=reader_excluded_format) as reader:
            records = list(reader.iterate())
            output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
            with vcf.VcfWriter(
                    output_path,
                    header=reader.header,
                    excluded_info_fields=writer_excluded_info,
                    excluded_format_fields=writer_excluded_format) as writer:
                for record in records:
                    writer.write(record)

        # Compare the written file byte-for-byte against the expected text.
        with open(output_path) as written_file:
            actual = written_file.read()
        self.assertEqual(actual, expected)
Пример #6
0
  def test_c_reader(self):
    """Checks that c_reader is non-zero for VCF, gzipped VCF and TFRecord."""
    self.assertNotEqual(self.sites_reader.c_reader, 0)
    self.assertNotEqual(self.samples_reader.c_reader, 0)

    golden_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')
    tfrecord_reader = vcf.VcfReader(golden_path)
    self.assertNotEqual(tfrecord_reader.c_reader, 0)
Пример #7
0
def make_ngs_error_examples(ref_path, vcf_path, bam_path):
  """Yields tf.Examples for training a ML model.

  Each tf.Example contains relevant features about the ngs read.

  Args:
    ref_path: str. A path to an indexed fasta file.
    vcf_path: str. A path to an indexed VCF file.
    bam_path: str. A path to an SAM/BAM file.

  Yields:
    A tuple (example, ngs_read_length, has_error), where example is a
    tf.Example, ngs_read_length is the length of the read generated by the
    sequencer, and has_error is a boolean specifying whether the example
    contains a read error.
  """
  ref_reader = fasta.IndexedFastaReader(ref_path)
  vcf_reader = vcf.VcfReader(vcf_path)

  # An empty ReadRequirements proto makes the SAM reader apply the standard
  # filtering implied by the proto's default values.
  read_requirements = reads_pb2.ReadRequirements()
  sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

  # The readers are context managers; one `with` closes all of them once the
  # loop over the reads is finished.
  with ref_reader, vcf_reader, sam_reader:
    for read in sam_reader.iterate():
      alignment = read.alignment
      assert len(alignment.cigar) > 0
      leading_op = alignment.cigar[0]

      # When the read begins with a soft clip, the sequence starts that many
      # bases before the reported alignment position.
      start = alignment.position.position
      if leading_op.operation == cigar_pb2.CigarUnit.CLIP_SOFT:
        start -= leading_op.operation_length
      read_range = ranges.make_range(alignment.position.reference_name, start,
                                     start + len(read.aligned_sequence))

      # Variants overlapping the read and the reference bases it spans.
      overlapping_variants = list(vcf_reader.query(read_range))
      ref_bases = ref_reader.query(read_range)

      if not is_usable_training_example(read, overlapping_variants,
                                        ref_bases):
        continue

      example = make_example(read, ref_bases)
      read_length = len(read.aligned_sequence)
      has_error = read.aligned_sequence != ref_bases
      yield example, read_length, has_error
Пример #8
0
    def test_main(self):
        """Runs filter_vcf.main and checks which variants reach the output."""
        input_path = test_utils.genomics_core_testdata('test_vaf.vcf')
        output_path = test_utils.test_tmpfile('output.vcf')
        filter_vcf.main(['filter_vcf', input_path, output_path])

        with vcf.VcfReader(output_path) as reader:
            kept = list(reader)
            self.assertEqual(3, len(kept))
            first_names = [variant.names[0] for variant in kept]
            self.assertEqual(['DogSNP4', 'DogSNP5', 'DogSNP6'], first_names)
            # Every variant in the output must be above the quality cutoff.
            for variant in kept:
                self.assertTrue(variant.quality > 3.01)
Пример #9
0
def main(argv):
    """Recalls every variant of the input VCF and writes it to the output VCF.

    The values of FLAGS.genotype_priors are converted to floats, divided by
    their sum, and turned into log10 values. Those log priors are passed with
    each variant to recall_variant before the variant is written to
    FLAGS.output_vcf using the input file's header.

    Args:
        argv: Command-line arguments; unused.
    """
    del argv

    # Build a real list: on Python 3 `map` returns a one-shot iterator, so
    # sum() would consume it and the comprehension below would see no priors.
    priors = [float(prior) for prior in FLAGS.genotype_priors]
    sump = sum(priors)
    log_priors = [math.log10(x / sump) for x in priors]

    with vcf.VcfReader(FLAGS.input_vcf) as reader:
        with vcf.VcfWriter(FLAGS.output_vcf, header=reader.header) as writer:
            for variant in reader:
                recall_variant(log_priors, variant)
                # TODO(thomaswc): Also update the variant's quality.
                writer.write(variant)
Пример #10
0
def main(argv):
    """Copies the variants with quality above 3.01 into a new VCF.

    Prints a usage message and exits with -1 unless exactly two paths follow
    the program name.

    Args:
        argv: Sequence of [program_name, input_vcf, output_vcf].
    """
    if len(argv) != 3:
        print('Usage: {} <input_vcf> <output_vcf>'.format(argv[0]))
        sys.exit(-1)
    in_vcf, out_vcf = argv[1], argv[2]

    # Please try to keep the following part in sync with the documentation in
    # g3doc/overview.md.
    with vcf.VcfReader(in_vcf, use_index=False) as reader:
        print('Sample names in VCF: ', ' '.join(reader.header.sample_names))
        with vcf.VcfWriter(out_vcf, header=reader.header) as writer:
            passing = (v for v in reader if v.quality > 3.01)
            for variant in passing:
                writer.write(variant)
Пример #11
0
def main(argv):
  """Validates a VCF against a reference FASTA and exits with status 0.

  The contigs of both headers are passed to validate_contigs, then every
  variant is passed to validate_variant together with the reference reader.

  Args:
    argv: Sequence of [program_name, input_ref, input_vcf].
  """
  if len(argv) != 3:
    print('Usage: {} <input_ref> <input_vcf>'.format(argv[0]))
    sys.exit(-1)
  in_ref, in_vcf = argv[1], argv[2]

  with fasta.RefFastaReader(in_ref) as ref_reader, \
      vcf.VcfReader(in_vcf, use_index=False) as vcf_reader:
    validate_contigs(ref_reader.header.contigs, vcf_reader.header.contigs)
    for variant in vcf_reader:
      validate_variant(ref_reader, variant)

  # Reaching this point means none of the validation calls stopped the run.
  print('Reference and VCF are compatible.')
  sys.exit(0)
Пример #12
0
  def test_main(self):
    """Runs add_ad_to_vcf.main and checks the AD values of five variants."""
    in_fname = test_utils.genomics_core_testdata('test_allele_depth.vcf')
    out_fname = test_utils.test_tmpfile('output.vcf')
    add_ad_to_vcf.main(['add_ad_to_vcf', in_fname, out_fname])

    # Expected AD info values of the first five variants, in file order.
    expected_ads = ([3, 3], [30, 44], [15, 4], [2, 4], [24, 2])

    with vcf.VcfReader(out_fname, use_index=False) as reader:
      info_ids = [info.id for info in reader.header.infos]
      self.assertTrue('AD' in info_ids)
      for expected_ad in expected_ads:
        variant = next(reader)
        self.assertEqual(expected_ad,
                         variant_utils.get_info(variant, 'AD', reader))
Пример #13
0
def main(argv):
    """Writes a copy of the input VCF with an AD info field on each variant.

    Exits with -1 on a wrong argument count or when the input header already
    declares an AD info field.

    Args:
        argv: Sequence of [program_name, input_vcf, output_vcf].
    """
    if len(argv) != 3:
        print('Usage: %s <input_vcf> <output_vcf>' % argv[0])
        sys.exit(-1)
    in_vcf, out_vcf = argv[1], argv[2]

    with vcf.VcfReader(in_vcf) as reader:
        declared_info_ids = [info.id for info in reader.header.infos]
        if 'AD' in declared_info_ids:
            print('%s already contains AD field.' % in_vcf)
            sys.exit(-1)

        # No copy is made here: the AD definition is appended to the header
        # object obtained from the reader.
        out_header = reader.header
        ad_field = vcf_constants.reserved_info_field('AD')
        out_header.infos.extend([ad_field])

        with vcf.VcfWriter(out_vcf, header=out_header) as writer:
            for variant in reader:
                allele_depth = get_variant_ad(variant)
                variant_utils.set_info(variant, 'AD', allele_depth, writer)
                writer.write(variant)
def make_ngs_examples(hparams):
    """Yields one example per distinct pileup range found in the BAM file.

    Args:
        hparams: Object providing ref_path, vcf_path and bam_path; it is also
            forwarded to get_pileup_range and make_example.

    Yields:
        The result of make_example for every pileup range that passes
        is_usable_example.
    """
    ref_reader = fasta.IndexedFastaReader(input_path=hparams.ref_path)
    vcf_reader = vcf.VcfReader(input_path=hparams.vcf_path)
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(
        input_path=hparams.bam_path, read_requirements=read_requirements)

    # A second SAM reader serves the range queries so that they do not share
    # a reader with the iteration over all reads.
    sam_query_reader = sam.SamReader(
        input_path=hparams.bam_path, read_requirements=read_requirements)

    seen_pileup_ranges = set()
    with ref_reader, vcf_reader, sam_reader, sam_query_reader:
        for read in sam_reader:
            # Skip reads without a cigar or with a disallowed alignment.
            if not read.alignment.cigar:
                print('Skipping read, no cigar alignment found')
                continue
            if not has_allowed_alignment(read):
                continue

            # Derive the window the example is built from.
            read_range = utils.read_range(read)
            ref = ref_reader.query(region=read_range)
            pileup_range = get_pileup_range(hparams, read, read_range, ref)

            # Emit at most one example per pileup range; the serialized
            # proto serves as the set key.
            range_key = pileup_range.SerializeToString()
            if range_key in seen_pileup_ranges:
                continue
            seen_pileup_ranges.add(range_key)

            # Collect reads, reference bases and variants for the range.
            pileup_reads = list(sam_query_reader.query(region=pileup_range))
            pileup_ref = ref_reader.query(region=pileup_range)
            pileup_variants = list(vcf_reader.query(region=pileup_range))
            if not is_usable_example(pileup_reads, pileup_variants,
                                     pileup_ref):
                continue
            yield make_example(hparams, pileup_reads, pileup_ref,
                               pileup_range)
Пример #15
0
def main(argv):
  """Prints variant counts for a VCF: total, by type and by reference name.

  Args:
    argv: Sequence of [program_name, input_vcf].
  """
  if len(argv) != 2:
    print('Usage: {} <input_vcf>'.format(argv[0]))
    sys.exit(-1)
  in_vcf = argv[1]

  total = 0
  type_counts = collections.Counter()
  ref_counts = collections.Counter()

  with vcf.VcfReader(in_vcf) as reader:
    for variant in reader:
      total += 1
      type_counts[variant_utils.variant_type(variant)] += 1
      ref_counts[variant.reference_name] += 1

  variant_type = variant_utils.VariantType
  print('# variants: {}'.format(total))
  print('# ref variants: {}'.format(type_counts[variant_type.ref]))
  print('# SNP variants: {}'.format(type_counts[variant_type.snp]))
  print('# indel variants: {}'.format(type_counts[variant_type.indel]))
  # One line per reference name, in sorted order.
  for ref_name, count in sorted(ref_counts.items()):
    print('# variants in {}: {}'.format(ref_name, count))
Пример #16
0
  def test_header_format_mixed_order(self):
    """Tests reading a VCF whose FORMAT header lists properties out of order.

    The properties of the format fields are defined in mixed order in the
    header of the test file. For example,

    ##FORMAT=<ID=GT,Type=String,Number=1,Description="GT description">

    (In normal VCFs "Number" should come before "Type".)
    """
    vcf_path = test_utils.genomics_core_testdata(
        'header_format_mixed_order.vcf')
    with vcf.VcfReader(vcf_path) as vreader:
      formats = vreader.header.formats
      variants = list(vreader)

    # The header must expose exactly one FORMAT definition, parsed correctly.
    self.assertLen(formats, 1)
    gt_format = formats[0]
    self.assertEqual(gt_format.id, 'GT')
    self.assertEqual(gt_format.number, '1')
    self.assertEqual(gt_format.type, 'String')
    self.assertEqual(gt_format.description, 'GT description')

    # Both records must carry the expected genotype on their first call.
    self.assertLen(variants, 2)
    first_variant, second_variant = variants[0], variants[1]
    self.assertEqual(first_variant.calls[0].genotype, [0, 1])
    self.assertEqual(second_variant.calls[0].genotype, [1, 1])
Пример #17
0
  def setUp(self):
    """Opens readers on the sites VCF and the gzipped samples VCF."""
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.sites_reader = vcf.VcfReader(sites_path)

    samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
    self.samples_reader = vcf.VcfReader(samples_path)
Пример #18
0
 def setUp(self):
   """Opens the sites VCF and keeps a handle to its field access cache."""
   sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
   self.vcf_reader = vcf.VcfReader(sites_path)
   self.cache = self.vcf_reader.field_access_cache
Пример #19
0
def make_ngs_error_examples(ref_path,
                            vcf_path,
                            bam_path,
                            examples_out_path,
                            max_reads=None):
    """Driver program for ngs_errors.

    See module description for details.

    Args:
        ref_path: str. A path to an indexed fasta file.
        vcf_path: str. A path to an indexed VCF file.
        bam_path: str. A path to an SAM/BAM file.
        examples_out_path: str. A path where we will write out examples.
        max_reads: int or None. If not None, we will emit at most max_reads
            examples to examples_out_path.
    """
    ref_reader = fasta.IndexedFastaReader(ref_path)
    vcf_reader = vcf.VcfReader(vcf_path)

    # An empty ReadRequirements proto makes the SAM reader apply the standard
    # filtering implied by the proto's default values.
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

    # Destination for the generated tf.Examples.
    examples_out = genomics_writer.TFRecordWriter(examples_out_path)

    # Readers and writer are context managers; one `with` closes them all
    # when the loop ends or the function returns early.
    n_examples = 0
    with ref_reader, vcf_reader, sam_reader, examples_out:
        for reads_seen, read in enumerate(sam_reader.iterate(), start=1):
            # Range proto for the chrom/start/stop spanned by the read.
            read_range = utils.read_range(read)

            # Variants overlapping the read and the reference bases it spans.
            variants = list(vcf_reader.query(read_range))
            ref_bases = ref_reader.query(read_range)

            if not is_usable_training_example(read, variants, ref_bases):
                continue

            n_examples += 1
            examples_out.write(make_example(read, ref_bases))

            # Per-example logging; this is very verbose when many reads are
            # converted.
            logging.info(
                'Added an example for read %s (span=%s) with cigar %s [%d added '
                'of %d total reads]', read.fragment_name,
                ranges.to_literal(read_range),
                cigar.format_cigar_units(read.alignment.cigar), n_examples,
                reads_seen)

            # Stop as soon as the requested number of examples is written.
            if max_reads is not None and n_examples >= max_reads:
                return
Пример #20
0
from nucleus.io import vcf
# Open a reader on the NA12878 calls VCF.
r = vcf.VcfReader('NA12878_calls.vcf.gz')
# Disabled alternative: open the small test file and print each variant's
# start position.
#r = vcf.VcfReader('test_samples.vcf')
#for v in r:
#  print(v.start)