def test_round_trip_vcf(self, test_datum_name):
    """Checks that variants survive a write-then-read cycle unchanged.

    The variants of the named test datum are read, written to a temporary
    VCF using a copy of the source header, read back in, and compared with
    the variants that were originally read.

    Args:
      test_datum_name: name of the VCF test datum to round-trip.
    """
    source_path = test_utils.genomics_core_testdata(test_datum_name)
    round_trip_path = test_utils.test_tmpfile('output_' + test_datum_name)

    # Step 1: read the original variants.
    v1_reader = vcf.VcfReader(source_path)
    v1_records = list(v1_reader.iterate())
    self.assertTrue(v1_records, 'Reader failed to find records')

    # Step 2: write them out with our VcfWriter, handing it a deep copy of
    # the reader's header.
    with vcf_writer.VcfWriter.to_file(
            round_trip_path,
            copy.deepcopy(v1_reader.header),
            variants_pb2.VcfWriterOptions()) as writer:
        for record in v1_records:
            writer.write(record)

    # Step 3: read the freshly written file back in.
    v2_records = list(vcf.VcfReader(round_trip_path).iterate())

    # Step 4: both passes must agree.
    self.assertEqual(v1_records, v2_records,
                     'Round-tripped variants not as expected')
def setUp(self):
    """Creates the two readers shared by the tests.

    The sites VCF is opened with use_index=False and the gzipped samples VCF
    with use_index=True.
    """
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.sites_reader = vcf.VcfReader(sites_path, use_index=False)

    samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
    self.samples_reader = vcf.VcfReader(samples_path, use_index=True)
def test_vcf_query(self):
    """Checks that a range query yields the same variants from both files."""
    # The output file gets a tabix index built before any reader is opened.
    tabix.build_index(self.output_file)
    self.input_reader = vcf.VcfReader(self.input_file)
    self.output_reader = vcf.VcfReader(self.output_file)

    query_region = ranges.parse_literal('chr3:100,000-500,000')
    from_input = list(self.input_reader.query(query_region))
    from_output = list(self.output_reader.query(query_region))
    self.assertEqual(from_input, from_output)
def test_headerless_vcf(self):
    """Writes a VCF with exclude_header=True and reads it back out."""
    source_vcf = test_utils.genomics_core_testdata('test_sites.vcf')
    headerless_vcf = test_utils.test_tmpfile('output.vcf')

    written_variants = []
    with vcf.VcfReader(source_vcf) as reader:
        with vcf.VcfWriter(
                headerless_vcf,
                header=reader.header,
                exclude_header=True) as writer:
            for variant in reader:
                written_variants.append(variant)
                writer.write(variant)

    # The source reader's header is passed explicitly when re-reading the
    # file that was written with exclude_header=True.
    with vcf.VcfReader(headerless_vcf, header=reader.header) as actual_reader:
        read_back = list(actual_reader)
        self.assertEqual(written_variants, read_back)
def test_roundtrip(self,
                   expected_infos,
                   expected_fmt,
                   expected_fmt1,
                   expected_fmt2,
                   reader_excluded_info=None,
                   reader_excluded_format=None,
                   writer_excluded_info=None,
                   writer_excluded_format=None):
    """Round-trips test_py_roundtrip.vcf and compares the written text.

    The expected file content is self.header followed by each template in
    self.record_format_strings filled in with the corresponding expected
    values.

    Args:
      expected_infos: per-record values substituted for `info`.
      expected_fmt: value substituted for `fmt` in every record.
      expected_fmt1: per-record values substituted for `efmts1`.
      expected_fmt2: per-record values substituted for `efmts2`.
      reader_excluded_info: forwarded to VcfReader as excluded_info_fields.
      reader_excluded_format: forwarded to VcfReader as
        excluded_format_fields.
      writer_excluded_info: forwarded to VcfWriter as excluded_info_fields.
      writer_excluded_format: forwarded to VcfWriter as
        excluded_format_fields.
    """
    # Render every record template with its expected field values.
    rendered_records = []
    per_record_values = zip(self.record_format_strings, expected_infos,
                            expected_fmt1, expected_fmt2)
    for template, info, fmt1, fmt2 in per_record_values:
        rendered_records.append(
            template.format(
                info=info, fmt=expected_fmt, efmts1=fmt1, efmts2=fmt2))
    expected = self.header + ''.join(rendered_records)

    input_path = test_utils.genomics_core_testdata('test_py_roundtrip.vcf')
    with vcf.VcfReader(
            input_path,
            excluded_info_fields=reader_excluded_info,
            excluded_format_fields=reader_excluded_format) as reader:
        records = list(reader.iterate())

    output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
    with vcf.VcfWriter(
            output_path,
            header=reader.header,
            excluded_info_fields=writer_excluded_info,
            excluded_format_fields=writer_excluded_format) as writer:
        for record in records:
            writer.write(record)

    # Compare the raw text that ended up on disk.
    with open(output_path) as written_file:
        actual = written_file.read()
    self.assertEqual(actual, expected)
def test_c_reader(self):
    """Checks that c_reader is not 0 for the fixture and TFRecord readers."""
    for fixture_reader in (self.sites_reader, self.samples_reader):
        self.assertNotEqual(fixture_reader.c_reader, 0)

    tfrecord_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')
    tfrecord_reader = vcf.VcfReader(tfrecord_path)
    self.assertNotEqual(tfrecord_reader.c_reader, 0)
def make_ngs_error_examples(ref_path, vcf_path, bam_path):
    """Yields tf.Examples for training a ML model.

    Each tf.Example contains relevant features about the ngs read.

    Args:
      ref_path: str. A path to an indexed fasta file.
      vcf_path: str. A path to an indexed VCF file.
      bam_path: str. A path to an SAM/BAM file.

    Yields:
      A tuple (example, ngs_read_length, has_error), where example is a
      tf.Example, ngs_read_length is the length of the read generated by the
      sequencer, and has_error is a boolean specifying whether the example
      contains a read error.
    """
    ref_reader = fasta.IndexedFastaReader(ref_path)
    vcf_reader = vcf.VcfReader(vcf_path)

    # The SAM reader is given an empty ReadRequirements proto, i.e. one that
    # carries only the proto's default values.
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

    # Every reader is a context manager; the single `with` closes all of them
    # once we stop looping over the reads.
    with ref_reader, vcf_reader, sam_reader:
        for read in sam_reader.iterate():
            cigar_units = read.alignment.cigar
            assert len(cigar_units) > 0
            leading_unit = cigar_units[0]

            # When the read opens with a soft clip, the sequence begins that
            # many bases before the reported alignment position.
            span_start = read.alignment.position.position
            if leading_unit.operation == cigar_pb2.CigarUnit.CLIP_SOFT:
                span_start -= leading_unit.operation_length
            span_end = span_start + len(read.aligned_sequence)
            read_range = ranges.make_range(
                read.alignment.position.reference_name, span_start, span_end)

            # Variants overlapping the read, and the reference bases under it.
            variants = list(vcf_reader.query(read_range))
            ref_bases = ref_reader.query(read_range)

            # Skip reads that cannot be turned into a training example.
            if not is_usable_training_example(read, variants, ref_bases):
                continue

            example = make_example(read, ref_bases)
            ngs_read_length = len(read.aligned_sequence)
            has_error = read.aligned_sequence != ref_bases
            yield example, ngs_read_length, has_error
def test_main(self):
    """Runs filter_vcf.main on test_vaf.vcf and checks the surviving variants."""
    in_fname = test_utils.genomics_core_testdata('test_vaf.vcf')
    out_fname = test_utils.test_tmpfile('output.vcf')
    filter_vcf.main(['filter_vcf', in_fname, out_fname])

    with vcf.VcfReader(out_fname) as reader:
        variants = list(reader)
        self.assertEqual(3, len(variants))

        first_names = [variant.names[0] for variant in variants]
        self.assertEqual(['DogSNP4', 'DogSNP5', 'DogSNP6'], first_names)

        # Every variant that was kept must clear the quality threshold.
        for variant in variants:
            self.assertTrue(variant.quality > 3.01)
def main(argv):
    """Re-calls every variant of FLAGS.input_vcf into FLAGS.output_vcf.

    The genotype priors from FLAGS.genotype_priors are normalized to sum to
    one and converted to log10 space; recall_variant is then applied to each
    variant with those log priors before the variant is written out with the
    input file's header.

    Args:
      argv: unused command-line arguments; all inputs come from FLAGS.
    """
    del argv  # Unused.

    # Materialize the priors as a list. They are traversed twice (once by
    # sum(), once for the normalization below); under Python 3 a lazy map()
    # iterator would be exhausted by sum(), leaving log_priors empty.
    priors = [float(prior) for prior in FLAGS.genotype_priors]
    total_prior = sum(priors)
    log_priors = [math.log10(prior / total_prior) for prior in priors]

    with vcf.VcfReader(FLAGS.input_vcf) as reader:
        with vcf.VcfWriter(FLAGS.output_vcf, header=reader.header) as writer:
            for variant in reader:
                recall_variant(log_priors, variant)
                # TODO(thomaswc): Also update the variant's quality.
                writer.write(variant)
def main(argv):
    """Copies the variants whose quality exceeds 3.01 into a new VCF.

    Args:
      argv: [program_name, input_vcf, output_vcf]. With any other number of
        entries a usage line is printed and the process exits with -1.
    """
    if len(argv) != 3:
        print('Usage: {} <input_vcf> <output_vcf>'.format(argv[0]))
        sys.exit(-1)
    _, in_vcf, out_vcf = argv

    # Please try to keep the following part in sync with the documentation in
    # g3doc/overview.md.
    with vcf.VcfReader(in_vcf, use_index=False) as reader:
        print('Sample names in VCF: ', ' '.join(reader.header.sample_names))
        with vcf.VcfWriter(out_vcf, header=reader.header) as writer:
            passing = (variant for variant in reader if variant.quality > 3.01)
            for variant in passing:
                writer.write(variant)
def main(argv):
    """Validates a VCF against a reference FASTA.

    The contigs of both headers are passed to validate_contigs, then every
    variant is passed to validate_variant together with the reference reader.
    If nothing stops the run, a success line is printed and the process exits
    with status 0.

    Args:
      argv: [program_name, input_ref, input_vcf]. With any other number of
        entries a usage line is printed and the process exits with -1.
    """
    if len(argv) != 3:
        print('Usage: {} <input_ref> <input_vcf>'.format(argv[0]))
        sys.exit(-1)
    _, in_ref, in_vcf = argv

    with fasta.RefFastaReader(in_ref) as ref_reader, \
            vcf.VcfReader(in_vcf, use_index=False) as vcf_reader:
        validate_contigs(ref_reader.header.contigs, vcf_reader.header.contigs)
        for variant in vcf_reader:
            validate_variant(ref_reader, variant)

    # Reaching this point means every check passed.
    print('Reference and VCF are compatible.')
    sys.exit(0)
def test_main(self):
    """Runs add_ad_to_vcf.main and checks the AD info it writes."""
    in_fname = test_utils.genomics_core_testdata('test_allele_depth.vcf')
    out_fname = test_utils.test_tmpfile('output.vcf')
    add_ad_to_vcf.main(['add_ad_to_vcf', in_fname, out_fname])

    # Expected AD values of the first five variants, in file order.
    expected_ads = ([3, 3], [30, 44], [15, 4], [2, 4], [24, 2])

    with vcf.VcfReader(out_fname, use_index=False) as reader:
        info_ids = [info.id for info in reader.header.infos]
        self.assertTrue('AD' in info_ids)

        for expected_ad in expected_ads:
            variant = next(reader)
            self.assertEqual(expected_ad,
                             variant_utils.get_info(variant, 'AD', reader))
def main(argv):
    """Writes a copy of the input VCF with an AD info field on every variant.

    Args:
      argv: [program_name, input_vcf, output_vcf]. With any other number of
        entries a usage line is printed and the process exits with -1. The
        process also exits with -1 when the input header already declares an
        AD info field.
    """
    if len(argv) != 3:
        print('Usage: %s <input_vcf> <output_vcf>' % argv[0])
        sys.exit(-1)
    _, in_vcf, out_vcf = argv

    with vcf.VcfReader(in_vcf) as reader:
        declared_info_ids = [info.id for info in reader.header.infos]
        if 'AD' in declared_info_ids:
            print('%s already contains AD field.' % in_vcf)
            sys.exit(-1)

        # The output header is the reader's own header object with the
        # reserved AD info definition appended to it.
        out_header = reader.header
        ad_info = vcf_constants.reserved_info_field('AD')
        out_header.infos.extend([ad_info])

        with vcf.VcfWriter(out_vcf, header=out_header) as writer:
            for variant in reader:
                allele_depth = get_variant_ad(variant)
                variant_utils.set_info(variant, 'AD', allele_depth, writer)
                writer.write(variant)
def make_ngs_examples(hparams):
    """Generator function that yields training, evaluation and test examples.

    Args:
      hparams: object providing ref_path, vcf_path and bam_path; it is also
        forwarded to get_pileup_range and make_example.

    Yields:
      The result of make_example for every distinct pileup range that passes
      is_usable_example.
    """
    ref_reader = fasta.IndexedFastaReader(input_path=hparams.ref_path)
    vcf_reader = vcf.VcfReader(input_path=hparams.vcf_path)
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(
        input_path=hparams.bam_path, read_requirements=read_requirements)
    # A second SAM reader serves the pileup-range queries, leaving the first
    # one free to drive the iteration over all reads.
    sam_query_reader = sam.SamReader(
        input_path=hparams.bam_path, read_requirements=read_requirements)

    # Serialized pileup ranges for which an example was already attempted.
    seen_pileup_ranges = set()

    with ref_reader, vcf_reader, sam_reader, sam_query_reader:
        for read in sam_reader:
            # A read needs a cigar and an allowed alignment to be considered.
            if not read.alignment.cigar:
                print('Skipping read, no cigar alignment found')
                continue
            if not has_allowed_alignment(read):
                continue

            # Work out the window used to construct the example.
            read_range = utils.read_range(read)
            ref = ref_reader.query(region=read_range)
            pileup_range = get_pileup_range(hparams, read, read_range, ref)

            # Never build more than one example per pileup range.
            range_key = pileup_range.SerializeToString()
            if range_key in seen_pileup_ranges:
                continue
            seen_pileup_ranges.add(range_key)

            # Collect the reads, reference sequence and truth variants that
            # fall in the pileup range.
            pileup_reads = list(sam_query_reader.query(region=pileup_range))
            pileup_ref = ref_reader.query(region=pileup_range)
            pileup_variants = list(vcf_reader.query(region=pileup_range))

            if not is_usable_example(pileup_reads, pileup_variants, pileup_ref):
                continue
            yield make_example(hparams, pileup_reads, pileup_ref, pileup_range)
def main(argv):
    """Prints variant counts for a VCF: total, by type, and by reference name.

    Args:
      argv: [program_name, input_vcf]. With any other number of entries a
        usage line is printed and the process exits with -1.
    """
    if len(argv) != 2:
        print('Usage: {} <input_vcf>'.format(argv[0]))
        sys.exit(-1)
    _, in_vcf = argv

    total = 0
    type_counts = collections.Counter()
    ref_counts = collections.Counter()
    with vcf.VcfReader(in_vcf) as reader:
        for variant in reader:
            total += 1
            type_counts[variant_utils.variant_type(variant)] += 1
            ref_counts[variant.reference_name] += 1

    variant_types = variant_utils.VariantType
    print('# variants: {}'.format(total))
    print('# ref variants: {}'.format(type_counts[variant_types.ref]))
    print('# SNP variants: {}'.format(type_counts[variant_types.snp]))
    print('# indel variants: {}'.format(type_counts[variant_types.indel]))

    # One line per reference name, in sorted order.
    for reference_name, count in sorted(ref_counts.items()):
        print('# variants in {}: {}'.format(reference_name, count))
def test_header_format_mixed_order(self):
    """Tests reading a VCF with unconventional FORMAT field definition.

    Tests reading a VCF in which the properties of the format fields are
    defined in mixed order in the header. For example,

    ##FORMAT=<ID=GT,Type=String,Number=1,Description="GT description">

    (In normal VCFs "Number" should come before "Type".)
    """
    vcf_path = test_utils.genomics_core_testdata(
        'header_format_mixed_order.vcf')
    with vcf.VcfReader(vcf_path) as vreader:
        formats = vreader.header.formats
        variants = list(vreader)

    # The header declares exactly one FORMAT field, GT.
    self.assertLen(formats, 1)
    gt_format = formats[0]
    self.assertEqual(gt_format.id, 'GT')
    self.assertEqual(gt_format.number, '1')
    self.assertEqual(gt_format.type, 'String')
    self.assertEqual(gt_format.description, 'GT description')

    # Both variants carry a genotype on their first call.
    self.assertLen(variants, 2)
    first_variant, second_variant = variants
    self.assertEqual(first_variant.calls[0].genotype, [0, 1])
    self.assertEqual(second_variant.calls[0].genotype, [1, 1])
def setUp(self):
    """Creates readers for the sites VCF and the gzipped samples VCF."""
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.sites_reader = vcf.VcfReader(sites_path)

    samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
    self.samples_reader = vcf.VcfReader(samples_path)
def setUp(self):
    """Creates a reader for test_sites.vcf and keeps its field access cache."""
    sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
    self.vcf_reader = vcf.VcfReader(sites_path)
    self.cache = self.vcf_reader.field_access_cache
def make_ngs_error_examples(ref_path, vcf_path, bam_path, examples_out_path,
                            max_reads=None):
    """Driver program for ngs_errors.

    See module description for details.

    Args:
      ref_path: str. A path to an indexed fasta file.
      vcf_path: str. A path to an indexed VCF file.
      bam_path: str. A path to an SAM/BAM file.
      examples_out_path: str. A path where we will write out examples.
      max_reads: int or None. If not None, we will emit at most max_reads
        examples to examples_out_path.
    """
    ref_reader = fasta.IndexedFastaReader(ref_path)
    vcf_reader = vcf.VcfReader(vcf_path)

    # The SAM reader is given an empty ReadRequirements proto, i.e. one that
    # carries only the proto's default values.
    read_requirements = reads_pb2.ReadRequirements()
    sam_reader = sam.SamReader(bam_path, read_requirements=read_requirements)

    # Destination for the tf.Examples we generate.
    examples_out = genomics_writer.TFRecordWriter(examples_out_path)

    # Readers and writer are all context managers; one `with` opens them and
    # closes them again when we are done looping over the reads.
    n_examples = 0
    with ref_reader, vcf_reader, sam_reader, examples_out:
        for i, read in enumerate(sam_reader.iterate(), start=1):
            # Range proto for the chrom/start/stop spanned by the read.
            read_range = utils.read_range(read)

            # Variants overlapping the read, and the reference bases under it.
            variants = list(vcf_reader.query(read_range))
            ref_bases = ref_reader.query(read_range)

            # Skip reads that cannot be turned into a training example.
            if not is_usable_training_example(read, variants, ref_bases):
                continue

            n_examples += 1

            # Build the tf.Example and append it to the TFRecord output.
            example = make_example(read, ref_bases)
            examples_out.write(example)

            # Convenient, but very verbose when many reads are converted.
            logging.info(
                ('Added an example for read %s (span=%s) with cigar %s [%d added '
                 'of %d total reads]'),
                read.fragment_name,
                ranges.to_literal(read_range),
                cigar.format_cigar_units(read.alignment.cigar),
                n_examples,
                i)

            # Stop as soon as the requested number of examples is reached.
            if max_reads is not None and n_examples >= max_reads:
                return
# Scratch script: opens a VCF reader on a local call set.
from nucleus.io import vcf

# Reader over the gzipped NA12878 call set; the path is relative, so the file
# is looked up relative to the current working directory.
r = vcf.VcfReader('NA12878_calls.vcf.gz')

# Alternative input and a loop printing each variant's start, kept disabled:
#r = vcf.VcfReader('test_samples.vcf')
#for v in r:
#  print(v.start)