def setUp(self):
  """Creates VCF readers over the plain-text and bgzipped test files."""
  sites_path = test_utils.genomics_core_testdata('test_sites.vcf')
  samples_path = test_utils.genomics_core_testdata('test_samples.vcf.gz')
  # The plain-text VCF is read without an index; the bgzipped one with.
  self.sites_reader = vcf.VcfReader(sites_path, use_index=False)
  self.samples_reader = vcf.VcfReader(samples_path, use_index=True)
def test_wrap(self, fasta_filename):
  """Exercises the full GenomeReferenceFai surface over the test FASTA."""
  expected_names = ['chrM', 'chr1', 'chr2']
  expected_lengths = [100, 76, 121]
  fasta = test_utils.genomics_core_testdata(fasta_filename)
  fai = test_utils.genomics_core_testdata(fasta_filename + '.fai')
  with reference_fai.GenomeReferenceFai.from_file(fasta, fai) as ref:
    # Aggregate metadata about the reference.
    self.assertEqual(ref.n_contigs, 3)
    self.assertIn(fasta, ref.fasta_path)
    self.assertIn('GenomeReference backed by htslib FAI index', str(ref))
    self.assertEqual(ref.contig_names, expected_names)
    self.assertEqual(ref.n_bp, sum(expected_lengths))
    # Base lookup and interval validation.
    self.assertEqual(ref.bases(ranges.make_range('chrM', 1, 10)), 'ATCACAGGT')
    self.assertTrue(ref.is_valid_interval(ranges.make_range('chrM', 1, 10)))
    self.assertFalse(
        ref.is_valid_interval(ranges.make_range('chrM', 1, 100000)))
    # Per-contig accessors must agree with the aggregate views.
    self.assertEqual(len(ref.contigs), 3)
    self.assertEqual([c.name for c in ref.contigs], expected_names)
    self.assertEqual([c.n_bases for c in ref.contigs], expected_lengths)
    for contig in ref.contigs:
      self.assertEqual(ref.contig(contig.name), contig)
      self.assertTrue(ref.has_contig(contig.name))
      self.assertFalse(ref.has_contig(contig.name + '.unknown'))
def test_from_file_raises_with_missing_inputs(self, fasta_filename,
                                              fai_filename):
  """from_file should fail cleanly when the fasta/fai inputs are absent."""
  fasta = test_utils.genomics_core_testdata(fasta_filename)
  fai = test_utils.genomics_core_testdata(fai_filename)
  # assertRaisesRegexp is the deprecated Python 2 alias; assertRaisesRegex
  # is the supported spelling on Python 3.
  with self.assertRaisesRegex(
      ValueError,
      'Not found: could not load fasta and/or fai for fasta ' + fasta):
    reference_fai.GenomeReferenceFai.from_file(fasta, fai)
def setUp(self):
  """Opens the sites and samples VCFs with default reader options."""
  self.options = variants_pb2.VcfReaderOptions()
  self.sites_vcf = test_utils.genomics_core_testdata('test_sites.vcf')
  self.samples_vcf = test_utils.genomics_core_testdata('test_samples.vcf.gz')
  self.sites_reader = vcf_reader.VcfReader.from_file(self.sites_vcf,
                                                     self.options)
  self.samples_reader = vcf_reader.VcfReader.from_file(self.samples_vcf,
                                                       self.options)
def test_from_file_raises_with_missing_inputs(self, fasta_filename,
                                              fai_filename):
  """IndexedFastaReader.from_file must raise when fasta/fai are missing."""
  fasta = test_utils.genomics_core_testdata(fasta_filename)
  fai = test_utils.genomics_core_testdata(fai_filename)
  # assertRaisesRegexp is the deprecated Python 2 alias; assertRaisesRegex
  # is the supported spelling on Python 3.
  with self.assertRaisesRegex(
      ValueError,
      'Not found: could not load fasta and/or fai for fasta ' + fasta):
    reference.IndexedFastaReader.from_file(fasta, fai)
def setUp(self):
  """Builds the two VCF readers used by the tests in this case."""
  # Unindexed reader over the plain-text sites file.
  self.sites_reader = vcf.VcfReader(
      test_utils.genomics_core_testdata('test_sites.vcf'), use_index=False)
  # Index-backed reader over the bgzipped samples file.
  self.samples_reader = vcf.VcfReader(
      test_utils.genomics_core_testdata('test_samples.vcf.gz'),
      use_index=True)
def test_iterate(self, fasta_filename):
  """Indexed and unindexed FASTA readers must yield identical records."""
  path = test_utils.genomics_core_testdata(fasta_filename)
  with_index = fasta.IndexedFastaReader(path)
  without_index = fasta.UnindexedFastaReader(path)
  # The two implementations iterate the same file, so their record
  # streams must match exactly.
  self.assertEqual(list(with_index.iterate()),
                   list(without_index.iterate()))
def setUp(self):
  """Creates one unindexed and one filename-index-backed VCF reader."""
  self.unindexed_options = variants_pb2.VcfReaderOptions()
  self.indexed_options = variants_pb2.VcfReaderOptions(
      index_mode=index_pb2.INDEX_BASED_ON_FILENAME)
  self.sites_vcf = test_utils.genomics_core_testdata('test_sites.vcf')
  self.samples_vcf = test_utils.genomics_core_testdata('test_samples.vcf.gz')
  self.sites_reader = vcf_reader.VcfReader.from_file(
      self.sites_vcf, self.unindexed_options)
  self.samples_reader = vcf_reader.VcfReader.from_file(
      self.samples_vcf, self.indexed_options)
def _make_reader(self, filename, has_embedded_ref):
  """Returns a SamReader for filename, honoring any embedded reference.

  When the file carries an embedded reference we omit ref_path so the reader
  uses it. Otherwise we explicitly override the reference encoded in the UR
  tag of the CRAM file with the path to our test.fasta.
  """
  path = test_utils.genomics_core_testdata(filename)
  if has_embedded_ref:
    return sam.SamReader(path)
  return sam.SamReader(
      path, ref_path=test_utils.genomics_core_testdata('test.fasta'))
def test_dispatching_reader(self):
  """FastaReader dispatches to the indexed or unindexed implementation."""
  indexed_path = test_utils.genomics_core_testdata('test.fasta')
  with fasta.FastaReader(indexed_path) as reader:
    # Indexed files produce an IndexedFastaReader, which supports query().
    self.assertEqual(reader.query(ranges.make_range('chrM', 1, 6)), 'ATCAC')
  unindexed_path = test_utils.genomics_core_testdata('unindexed.fasta')
  with fasta.FastaReader(unindexed_path) as reader:
    # Unindexed files produce an UnindexedFastaReader, whose query() is
    # unsupported.
    with self.assertRaises(NotImplementedError):
      reader.query(ranges.make_range('chrM', 1, 5))
def test_sam_iterate_raises_on_malformed_record(self):
  """Iteration succeeds up to, and fails at, a malformed SAM record."""
  path = test_utils.genomics_core_testdata('malformed.sam')
  reader = sam_reader.SamReader.from_file(path, self.options)
  records = iter(reader.iterate())
  # The first record parses fine...
  self.assertIsNotNone(next(records))
  # ...but draining the remainder hits the malformed record.
  with self.assertRaises(ValueError):
    list(records)
def test_c_reader(self):
  """Every reader flavor should expose a non-null underlying C reader."""
  tfrecord_reader = vcf.VcfReader(
      test_utils.genomics_core_testdata('test_samples.vcf.golden.tfrecord'))
  for reader in (self.sites_reader, self.samples_reader, tfrecord_reader):
    self.assertNotEqual(reader.c_reader, 0)
def setUp(self):
  """Copies the bgzipped test VCF to a writable temp location."""
  super(TabixTest, self).setUp()
  self.input_file = test_utils.genomics_core_testdata('test_samples.vcf.gz')
  self.output_file = test_utils.test_tmpfile('test_samples.vcf.gz')
  shutil.copyfile(self.input_file, self.output_file)
  # Paths where the two index flavors (tabix .tbi and .csi) would land.
  self.tbx_index_file = self.output_file + '.tbi'
  self.csi_index_file = self.output_file + '.csi'
def test_roundtrip(self,
                   expected_infos,
                   expected_fmt,
                   expected_fmt1,
                   expected_fmt2,
                   reader_excluded_info=None,
                   reader_excluded_format=None,
                   writer_excluded_info=None,
                   writer_excluded_format=None):
  """Reads a VCF, writes it back out, and compares against expected text."""
  # Render the expected records from their per-record format templates.
  expected_records = [
      template.format(info=info, fmt=expected_fmt, efmts1=e1, efmts2=e2)
      for template, info, e1, e2 in zip(self.record_format_strings,
                                        expected_infos, expected_fmt1,
                                        expected_fmt2)
  ]
  expected = self.header + ''.join(expected_records)
  with vcf.VcfReader(
      test_utils.genomics_core_testdata('test_py_roundtrip.vcf'),
      use_index=False,
      excluded_info_fields=reader_excluded_info,
      excluded_format_fields=reader_excluded_format) as reader:
    records = list(reader.iterate())
    output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
    with vcf.VcfWriter(
        output_path,
        header=reader.header,
        excluded_info_fields=writer_excluded_info,
        excluded_format_fields=writer_excluded_format) as writer:
      for record in records:
        writer.write(record)
  with open(output_path) as f:
    actual = f.read()
  self.assertEqual(actual, expected)
def test_round_trip_vcf(self, test_datum_name):
  """Variants should survive a write/read round trip unchanged.

  1. Read variants v1 with VcfReader (no index).
  2. Write v1 out using VcfWriter.
  3. Read them back in as v2.
  4. Check that v1 == v2.
  """
  in_file = test_utils.genomics_core_testdata(test_datum_name)
  out_file = test_utils.test_tmpfile('output_' + test_datum_name)
  v1_reader = vcf.VcfReader(in_file, use_index=False)
  v1_records = list(v1_reader.iterate())
  self.assertTrue(v1_records, 'Reader failed to find records')
  header = copy.deepcopy(v1_reader.header)
  writer_options = variants_pb2.VcfWriterOptions()
  with vcf_writer.VcfWriter.to_file(out_file, header,
                                    writer_options) as writer:
    for record in v1_records:
      writer.write(record)
  v2_reader = vcf.VcfReader(out_file, use_index=False)
  v2_records = list(v2_reader.iterate())
  self.assertEqual(v1_records, v2_records,
                   'Round-tripped variants not as expected')
def test_sam_iterate_raises_on_malformed_record(self):
  """A malformed SAM record surfaces as ValueError during iteration."""
  malformed_path = test_utils.genomics_core_testdata('malformed.sam')
  reader = sam_reader.SamReader.from_file(malformed_path, self.options)
  stream = iter(reader.iterate())
  # One good record precedes the malformed one.
  self.assertIsNotNone(next(stream))
  with self.assertRaises(ValueError):
    list(stream)
def test_native_gff_header(self, gff_filename):
  """Both the dispatching and native GFF readers report the GFF version."""
  gff_path = test_utils.genomics_core_testdata(gff_filename)
  for reader_class in (gff.GffReader, gff.NativeGffReader):
    with reader_class(gff_path) as reader:
      self.assertEqual(EXPECTED_GFF_VERSION, reader.header.gff_version)
def test_round_trip_vcf(self, test_datum_name):
  """Variants must be identical after writing and re-reading them.

  1. Read variants v1 with VcfReader.
  2. Write v1 out using VcfWriter.
  3. Read them back in as v2.
  4. Check that v1 == v2.
  """
  in_file = test_utils.genomics_core_testdata(test_datum_name)
  out_file = test_utils.test_tmpfile('output_' + test_datum_name)
  v1_reader = vcf.VcfReader(in_file)
  v1_records = list(v1_reader.iterate())
  self.assertTrue(v1_records, 'Reader failed to find records')
  header = copy.deepcopy(v1_reader.header)
  writer_options = variants_pb2.VcfWriterOptions()
  with vcf_writer.VcfWriter.to_file(out_file, header,
                                    writer_options) as writer:
    for record in v1_records:
      writer.write(record)
  v2_records = list(vcf.VcfReader(out_file).iterate())
  self.assertEqual(v1_records, v2_records,
                   'Round-tripped variants not as expected')
def test_headless_sam_raises(self):
  """Opening a SAM file with no header is rejected immediately."""
  headerless = test_utils.genomics_core_testdata('headerless.sam')
  with self.assertRaisesRegex(ValueError,
                              'Could not parse file with bad SAM header'):
    sam_reader.SamReader.from_file(
        reads_path=headerless, ref_path='', options=self.options)
def setUp(self):
  """Loads golden GFF records and builds the matching header."""
  self.records = list(
      io_utils.read_tfrecords(
          test_utils.genomics_core_testdata('test_features.gff.tfrecord'),
          proto=gff_pb2.GffRecord))
  self.header = gff_pb2.GffHeader(
      sequence_regions=[ranges.make_range('ctg123', 0, 1497228)])
def test_query_without_index_raises(self, unindexed_file_name):
  """query() on an unindexed reads file must raise ValueError."""
  path = test_utils.genomics_core_testdata(unindexed_file_name)
  window = ranges.parse_literal('chr20:10,000,000-10,000,100')
  reader = sam_reader.SamReader.from_file(
      reads_path=path, ref_path='', options=self.options)
  with reader:
    with self.assertRaisesRegex(ValueError, 'Cannot query without an index'):
      reader.query(window)
def test_roundtrip(self,
                   expected_infos,
                   expected_fmt,
                   expected_fmt1,
                   expected_fmt2,
                   reader_excluded_info=None,
                   reader_excluded_format=None,
                   writer_excluded_info=None,
                   writer_excluded_format=None):
  """Round-trips a VCF through reader and writer, checking the output text."""
  # Build the expected file body from the per-record format templates.
  expected_records = [
      template.format(info=info, fmt=expected_fmt, efmts1=e1, efmts2=e2)
      for template, info, e1, e2 in zip(self.record_format_strings,
                                        expected_infos, expected_fmt1,
                                        expected_fmt2)
  ]
  expected = self.header + ''.join(expected_records)
  with vcf.VcfReader(
      test_utils.genomics_core_testdata('test_py_roundtrip.vcf'),
      excluded_info_fields=reader_excluded_info,
      excluded_format_fields=reader_excluded_format) as reader:
    records = list(reader.iterate())
    output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
    with vcf.VcfWriter(
        output_path,
        header=reader.header,
        excluded_info_fields=writer_excluded_info,
        excluded_format_fields=writer_excluded_format) as writer:
      for record in records:
        writer.write(record)
  with open(output_path) as f:
    actual = f.read()
  self.assertEqual(actual, expected)
def test_bed_iterate_raises_on_malformed_record(self, filename):
  """BED iteration yields good records, then raises at the malformed one."""
  reader = bed_reader.BedReader.from_file(
      test_utils.genomics_core_testdata(filename), self.options)
  records = iter(reader.iterate())
  # The first record is well-formed.
  self.assertIsNotNone(next(records))
  with self.assertRaises(ValueError):
    list(records)
def test_bed_iterate_raises_on_malformed_record(self, filename):
  """A malformed BED record surfaces as ValueError during iteration."""
  malformed_path = test_utils.genomics_core_testdata(filename)
  reader = bed_reader.BedReader.from_file(malformed_path, self.options)
  stream = iter(reader.iterate())
  self.assertIsNotNone(next(stream))
  # Draining the rest of the stream hits the malformed record.
  with self.assertRaises(ValueError):
    list(stream)
def test_conversion_to_tfrecord_and_back(self, original_input_file):
  """Test conversion from a native file format to tfrecord.gz, then back."""
  input_path = test_utils.genomics_core_testdata(original_input_file)
  tfrecord_output_path = test_utils.test_tmpfile(original_input_file +
                                                 ".tfrecord.gz")
  native_output_path = test_utils.test_tmpfile(original_input_file)

  # Test conversion from native format to tfrecord.
  self._convert(input_path, tfrecord_output_path)

  # redacted
  if native_output_path.endswith(".sam"):
    raise unittest.SkipTest("SAM writing not yet supported")

  # Test conversion from tfrecord format back to native format.  Ensure that
  # conversions where we would need a header, but don't have one from the
  # input, trigger an error message.
  if any(
      native_output_path.endswith(ext) for ext in FORMATS_REQUIRING_HEADER):
    # assertRaisesRegexp is deprecated; assertRaisesRegex is the Python 3
    # spelling with identical behavior.
    with self.assertRaisesRegex(
        converter.ConversionError,
        "Input file does not have a header, which is needed to construct "
        "output file"):
      self._convert(tfrecord_output_path, native_output_path)
  else:
    self._convert(tfrecord_output_path, native_output_path)
def test_from_regions(self, regions, expected):
  """from_regions resolves the regions list to the expected ranges."""
  # For convenience we allow 'test.bed' in our regions, standing in for the
  # real path in the testdata directory.  Build a resolved copy instead of
  # mutating the caller's list in place: parameterized test data may be a
  # shared object, and in-place mutation would leak between test runs.
  resolved = [
      test_utils.genomics_core_testdata('test.bed')
      if region == 'test.bed' else region for region in regions
  ]
  self.assertEqual(list(ranges.from_regions(resolved)), expected)
def test_ops_on_closed_reader_raise(self):
  """Operations on a closed GFF reader must raise ValueError."""
  file_path = test_utils.genomics_core_testdata('test_features.gff')
  reader = gff_reader.GffReader.from_file(file_path, self.options)
  with reader:
    pass
  # At this point the reader is closed.
  # assertRaisesRegexp is the deprecated Python 2 alias; use the Python 3
  # name assertRaisesRegex.
  with self.assertRaisesRegex(ValueError, 'Cannot Iterate a closed'):
    reader.iterate()
def test_query_on_unindexed_reader_raises(self):
  """query() on a VCF opened without an index must raise ValueError."""
  window = ranges.parse_literal('chr1:10,000,000-10,000,100')
  unindexed_file = test_utils.genomics_core_testdata('test_samples.vcf')
  with vcf_reader.VcfReader.from_file(unindexed_file, self.options) as reader:
    # assertRaisesRegexp is the deprecated Python 2 alias; use the Python 3
    # name assertRaisesRegex.
    with self.assertRaisesRegex(ValueError, 'Cannot query without an index'):
      reader.query(window)
def test_headless_sam_raises(self):
  """Iterating a headerless SAM file fails on the first record."""
  headerless = test_utils.genomics_core_testdata('headerless.sam')
  reader = sam_reader.SamReader.from_file(
      reads_path=headerless, ref_path='', options=self.options)
  with self.assertRaises(ValueError):
    next(iter(reader.iterate()))
def test_from_bed(self, bed_filename):
  """RangeSet.from_bed loads the expected intervals from the BED file."""
  source = test_utils.genomics_core_testdata(bed_filename)
  expected = [
      ranges.make_range('chr1', 1, 10),
      ranges.make_range('chr2', 20, 30),
      ranges.make_range('chr2', 40, 60),
      ranges.make_range('chr3', 80, 90),
  ]
  # Order is irrelevant; compare as multisets.
  self.assertCountEqual(expected, ranges.RangeSet.from_bed(source))
def setUp(self):
  """Prepares BED paths, reader options, and the expected first record."""
  self.bed = test_utils.genomics_core_testdata('test_regions.bed')
  self.zipped_bed = test_utils.genomics_core_testdata('test_regions.bed.gz')
  self.options = bed_pb2.BedReaderOptions()
  # First record of test_regions.bed with every BED12 column populated.
  self.first = bed_pb2.BedRecord(
      reference_name='chr1',
      start=10,
      end=20,
      name='first',
      score=100,
      strand=bed_pb2.BedRecord.FORWARD_STRAND,
      thick_start=12,
      thick_end=18,
      item_rgb='255,124,1',
      block_count=3,
      block_sizes='2,6,2',
      block_starts='10,12,18')
def testCompressed(self):
  """TFRecordReader transparently handles gzip-compressed tfrecords."""
  reader = genomics_reader.TFRecordReader(
      test_utils.genomics_core_testdata('test_features.gff.tfrecord.gz'),
      gff_pb2.GffRecord(),
  )
  records = list(reader.iterate())
  first, second = records[0], records[1]
  self.assertEqual('GenBank', first.source)
  self.assertEqual('ctg123', second.range.reference_name)
def test_sam_query(self):
  """query() returns the expected number of reads for each interval."""
  cases = [('chr20:10,000,000-10,000,100', 106),
           ('chr20:10,000,000-10,000,000', 45)]
  with sam.SamReader(
      test_utils.genomics_core_testdata('test.bam')) as reader:
    for literal, n_expected in cases:
      with reader.query(ranges.parse_literal(literal)) as iterable:
        self.assertEqual(test_utils.iterable_len(iterable), n_expected)
def setUp(self):
  """Records BED fixture paths, options, and the expected first record."""
  self.bed = test_utils.genomics_core_testdata('test_regions.bed')
  self.zipped_bed = test_utils.genomics_core_testdata('test_regions.bed.gz')
  self.options = bed_pb2.BedReaderOptions()
  # All twelve BED columns of the file's first record.
  first_fields = dict(
      reference_name='chr1', start=10, end=20, name='first', score=100,
      strand=bed_pb2.BedRecord.FORWARD_STRAND, thick_start=12, thick_end=18,
      item_rgb='255,124,1', block_count=3, block_sizes='2,6,2',
      block_starts='10,12,18')
  self.first = bed_pb2.BedRecord(**first_fields)
def test_iterate_bed_reader(self, bed_filename):
  """Iterating the BED reader yields the two known test regions."""
  bed_path = test_utils.genomics_core_testdata(bed_filename)
  with bed.BedReader(bed_path) as reader:
    records = list(reader.iterate())
  self.assertLen(records, 2)
  actual = [(r.reference_name, r.start, r.end) for r in records]
  self.assertEqual(actual, [('chr1', 10, 20), ('chr1', 100, 200)])
def test_gff_iterate(self, test_features_gff_filename):
  """iterate() returns a wrapped C++ iterable holding the known records."""
  file_path = test_utils.genomics_core_testdata(test_features_gff_filename)
  with gff_reader.GffReader.from_file(file_path, self.options) as reader:
    iterable = reader.iterate()
    self.assertIsInstance(iterable, clif_postproc.WrappedCppIterable)
    # Materialize while the reader is still open.
    actual = list(iterable)
  self.assertLen(actual, 2)
  self.assertEqual(actual[0], self.first)
  self.assertEqual(actual[1], self.second)
def test_sam_query(self):
  """Each query window yields the expected read count from test.bam."""
  reader = sam.SamReader(test_utils.genomics_core_testdata('test.bam'))
  expectations = [('chr20:10,000,000-10,000,100', 106),
                  ('chr20:10,000,000-10,000,000', 45)]
  with reader:
    for window_literal, n_expected in expectations:
      window = ranges.parse_literal(window_literal)
      with reader.query(window) as iterable:
        self.assertEqual(test_utils.iterable_len(iterable), n_expected)
def test_iterate_fastq_reader(self, fastq_filename):
  """The FASTQ reader yields the three known read ids in file order."""
  fastq_path = test_utils.genomics_core_testdata(fastq_filename)
  with fastq.FastqReader(fastq_path) as reader:
    records = list(reader.iterate())
  self.assertLen(records, 3)
  self.assertEqual([r.id for r in records], [
      'NODESC:header', 'M01321:49:000000000-A6HWP:1:1101:17009:2216', 'FASTQ'
  ])
def test_iterate_gff_reader(self, gff_filename):
  """Iterating the GFF reader yields the two known feature ranges."""
  gff_path = test_utils.genomics_core_testdata(gff_filename)
  with gff.GffReader(gff_path) as reader:
    records = list(reader.iterate())
  self.assertLen(records, 2)
  actual = [(r.range.reference_name, r.range.start, r.range.end)
            for r in records]
  self.assertEqual(actual, [('ctg123', 999, 9000), ('ctg123', 999, 1012)])
def setUpClass(cls):
  """Builds an in-memory reader mirroring the FASTA-backed one."""
  cls.fasta_reader = fasta.RefFastaReader(
      test_utils.genomics_core_testdata('test.fasta'))
  # Copy every contig's full sequence into the in-memory reader so the two
  # readers can be compared against each other.
  cls.in_mem = fasta.InMemoryRefReader(
      [(contig.name, 0,
        cls.fasta_reader.query(
            ranges.make_range(contig.name, 0, contig.n_bases)))
       for contig in cls.fasta_reader.header.contigs])
def test_bam_iterate_partially(self):
  """Verify that iteration provides results incrementally, not all at once."""
  reader = sam.SamReader(
      test_utils.genomics_core_testdata('test.bam'), use_index=False)
  with reader:
    iterable = reader.iterate()
    # We expect 106 records in total: ten full slices of 10, then a final
    # slice of 6.
    # NOTE: xrange is Python-2-only; range behaves identically here and
    # works on both Python 2 and 3.
    for _ in range(10):
      results = list(itertools.islice(iterable, 10))
      self.assertEqual(len(results), 10)
    results = list(itertools.islice(iterable, 10))
    self.assertEqual(len(results), 6)
def test_downsampling(self, method, maybe_range, fraction,
                      expected_n_reads):
  """A downsampled reader returns the expected number of reads."""
  reader = sam.SamReader(
      test_utils.genomics_core_testdata('test.bam'),
      downsample_fraction=fraction,
      random_seed=12345)  # fixed seed keeps the downsample deterministic
  with reader:
    if method == 'iterate':
      reads_iter = reader.iterate()
    elif method == 'query':
      reads_iter = reader.query(ranges.parse_literal(maybe_range))
    else:
      # TestCase.fail() takes a single message argument; the previous
      # two-argument call would have raised TypeError instead of failing.
      self.fail('Unexpected method: {}'.format(method))
    self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
def test_wrap(self, fasta_filename):
  """Checks metadata, base lookup, and contig accessors of the wrapper."""
  names = ['chrM', 'chr1', 'chr2']
  lengths = [100, 76, 121]
  fasta = test_utils.genomics_core_testdata(fasta_filename)
  fai = test_utils.genomics_core_testdata(fasta_filename + '.fai')
  with reference_fai.GenomeReferenceFai.from_file(fasta, fai) as ref:
    # Whole-reference metadata.
    self.assertEqual(ref.n_contigs, 3)
    self.assertIn(fasta, ref.fasta_path)
    self.assertIn('GenomeReference backed by htslib FAI index', str(ref))
    self.assertEqual(ref.contig_names, names)
    self.assertEqual(ref.n_bp, sum(lengths))
    # Sequence retrieval and interval validity.
    interval = ranges.make_range('chrM', 1, 10)
    self.assertEqual(ref.bases(interval), 'ATCACAGGT')
    self.assertTrue(ref.is_valid_interval(interval))
    self.assertFalse(
        ref.is_valid_interval(ranges.make_range('chrM', 1, 100000)))
    # Contig-level views agree with the aggregates above.
    self.assertEqual(len(ref.contigs), 3)
    self.assertEqual([c.name for c in ref.contigs], names)
    self.assertEqual([c.n_bases for c in ref.contigs], lengths)
    for contig in ref.contigs:
      self.assertEqual(ref.contig(contig.name), contig)
      self.assertTrue(ref.has_contig(contig.name))
      self.assertFalse(ref.has_contig(contig.name + '.unknown'))
def test_writing_canned_variants(self):
  """Tests writing all the variants that are 'canned' in our tfrecord file."""
  # This file is in the TF record format
  tfrecord_file = test_utils.genomics_core_testdata(
      'test_samples.vcf.golden.tfrecord')
  writer_options = variants_pb2.VcfWriterOptions()
  # Construct a VCF header by hand that matches the golden file's metadata:
  # four contigs, one sample, the GATK VQSR filter tranches, one INFO field
  # and the standard FORMAT fields.
  header = variants_pb2.VcfHeader(
      contigs=[
          reference_pb2.ContigInfo(name='chr1', n_bases=248956422),
          reference_pb2.ContigInfo(name='chr2', n_bases=242193529),
          reference_pb2.ContigInfo(name='chr3', n_bases=198295559),
          reference_pb2.ContigInfo(name='chrX', n_bases=156040895)
      ],
      sample_names=['NA12878_18_99'],
      filters=[
          variants_pb2.VcfFilterInfo(
              id='PASS', description='All filters passed'),
          variants_pb2.VcfFilterInfo(id='LowQual', description=''),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL95.00to96.00'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL96.00to97.00'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL97.00to99.00'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.00to99.50'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.50to99.90'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.90to99.95'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.95to100.00+'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheINDEL99.95to100.00'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.50to99.60'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.60to99.80'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.80to99.90'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.90to99.95'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.95to100.00+'),
          variants_pb2.VcfFilterInfo(id='VQSRTrancheSNP99.95to100.00'),
      ],
      infos=[
          variants_pb2.VcfInfo(
              id='END',
              number='1',
              type='Integer',
              description='Stop position of the interval')
      ],
      formats=[
          variants_pb2.VcfFormatInfo(
              id='GT', number='1', type='String', description='Genotype'),
          variants_pb2.VcfFormatInfo(
              id='GQ',
              number='1',
              type='Integer',
              description='Genotype Quality'),
          variants_pb2.VcfFormatInfo(
              id='DP',
              number='1',
              type='Integer',
              description='Read depth of all passing filters reads.'),
          variants_pb2.VcfFormatInfo(
              id='MIN_DP',
              number='1',
              type='Integer',
              description='Minimum DP observed within the GVCF block.'),
          variants_pb2.VcfFormatInfo(
              id='AD',
              number='R',
              type='Integer',
              description=
              'Read depth of all passing filters reads for each allele.'),
          variants_pb2.VcfFormatInfo(
              id='VAF',
              number='A',
              type='Float',
              description='Variant allele fractions.'),
          variants_pb2.VcfFormatInfo(
              id='GL',
              number='G',
              type='Float',
              description='Genotype likelihoods, log10 encoded'),
          variants_pb2.VcfFormatInfo(
              id='PL',
              number='G',
              type='Integer',
              description='Genotype likelihoods, Phred encoded'),
      ],
  )
  variant_records = list(
      io_utils.read_tfrecords(tfrecord_file, proto=variants_pb2.Variant))
  out_fname = test_utils.test_tmpfile('output.vcf')
  # Only the first five canned variants are written and checked below.
  with vcf_writer.VcfWriter.to_file(out_fname, header,
                                    writer_options) as writer:
    for record in variant_records[:5]:
      writer.write(record)

  # Check: are the variants written as expected?
  # Each entry is one full line of the expected output VCF, header first.
  # pylint: disable=line-too-long
  expected_vcf_content = [
      '##fileformat=VCFv4.2\n',
      '##FILTER=<ID=PASS,Description="All filters passed">\n',
      '##FILTER=<ID=LowQual,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
      '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
      '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
      '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
      'the interval">\n',
      '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
      '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
      '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
      'passing filters reads.">\n',
      '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
      'observed within the GVCF block.">\n',
      '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
      'passing filters reads for each allele.">\n',
      '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
      'fractions.">\n',
      '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype '
      'likelihoods, log10 encoded">\n',
      '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
      'likelihoods, Phred encoded">\n',
      '##contig=<ID=chr1,length=248956422>\n',
      '##contig=<ID=chr2,length=242193529>\n',
      '##contig=<ID=chr3,length=198295559>\n',
      '##contig=<ID=chrX,length=156040895>\n',
      '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
      'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
      'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
      'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
      'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
      'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
  ]
  # pylint: enable=line-too-long
  with tf.gfile.GFile(out_fname, 'r') as f:
    self.assertEqual(f.readlines(), expected_vcf_content)
def setUp(self):
  """Sets up the BAM path plus unindexed and index-backed reader options."""
  self.bam = test_utils.genomics_core_testdata('test.bam')
  # Default options do not require an index; indexed_options force one.
  self.options = reads_pb2.SamReaderOptions()
  self.indexed_options = reads_pb2.SamReaderOptions(
      index_mode=index_pb2.INDEX_BASED_ON_FILENAME)
def test_bam_iterate(self):
  """Plain iteration over the test BAM yields all 106 reads."""
  bam_path = test_utils.genomics_core_testdata('test.bam')
  with sam.SamReader(bam_path, use_index=False) as reader:
    self.assertEqual(test_utils.iterable_len(reader.iterate()), 106)
def test_headless_sam_raises(self):
  """Reading the first record of a headerless SAM raises ValueError."""
  headerless_path = test_utils.genomics_core_testdata('headerless.sam')
  reader = sam_reader.SamReader.from_file(headerless_path, self.options)
  with self.assertRaises(ValueError):
    next(iter(reader.iterate()))
def test_make_ref_reader_default(self, fasta_filename):
  """A RefFastaReader built with defaults supports range queries."""
  path = test_utils.genomics_core_testdata(fasta_filename)
  with fasta.RefFastaReader(path) as reader:
    self.assertEqual(reader.query(ranges.make_range('chrM', 1, 6)), 'ATCAC')
def test_make_ref_reader_cache_specified(self, fasta_filename):
  """A RefFastaReader with an explicit cache size still answers queries."""
  path = test_utils.genomics_core_testdata(fasta_filename)
  with fasta.RefFastaReader(path, cache_size=10) as reader:
    self.assertEqual(reader.query(ranges.make_range('chrM', 1, 5)), 'ATCA')
def test_from_file_raises_with_missing_index(self):
  """Requesting index-based mode on an unindexed BAM must fail."""
  # assertRaisesRegexp is the deprecated Python 2 alias; use the Python 3
  # name assertRaisesRegex.
  with self.assertRaisesRegex(ValueError, 'Not found: No index found for'):
    sam_reader.SamReader.from_file(
        test_utils.genomics_core_testdata('unindexed.bam'),
        self.indexed_options)
def test_from_file_raises_with_missing_index(self):
  """Requesting index-based mode on an unindexed VCF must fail."""
  # assertRaisesRegexp is the deprecated Python 2 alias; use the Python 3
  # name assertRaisesRegex.
  with self.assertRaisesRegex(ValueError, 'Not found: No index found for'):
    vcf_reader.VcfReader.from_file(
        test_utils.genomics_core_testdata('test_sites.vcf'),
        self.indexed_options)
def setUp(self):
  """Opens the sites VCF and grabs its field access cache for the tests."""
  path = test_utils.genomics_core_testdata('test_sites.vcf')
  self.vcf_reader = vcf.VcfReader(path, use_index=False)
  self.cache = self.vcf_reader.field_access_cache