Пример #1
0
  def test_round_trip_vcf(self, test_datum_name):
    """Checks that variants survive a read -> write -> read cycle unchanged.

    The variants read from the input file are written to a temporary VCF with
    VcfWriter, read back with a second reader, and compared with the originals.

    Args:
      test_datum_name: File name handed to test_utils.genomics_core_testdata to
        locate the input; also used as the suffix of the temporary output file.
    """
    source_path = test_utils.genomics_core_testdata(test_datum_name)
    round_trip_path = test_utils.test_tmpfile('output_' + test_datum_name)

    # Step 1: load the original variants.
    source_reader = genomics_io.make_vcf_reader(source_path, use_index=False)
    original_variants = list(source_reader.iterate())
    self.assertTrue(original_variants, 'Reader failed to find records')

    # Step 2: write them out, configuring the writer with the contigs, samples
    # and filters exposed by the source reader.
    options = core_pb2.VcfWriterOptions(
        contigs=source_reader.contigs,
        sample_names=source_reader.samples,
        filters=source_reader.filters)
    with vcf_writer.VcfWriter.to_file(round_trip_path, options) as vcf_out:
      for variant in original_variants:
        vcf_out.write(variant)

    # Step 3: read the freshly written file back in.
    round_trip_reader = genomics_io.make_vcf_reader(
        round_trip_path, use_index=False)
    round_tripped_variants = list(round_trip_reader.iterate())

    # Step 4: both passes must yield the same records.
    self.assertEqual(original_variants, round_tripped_variants,
                     'Round-tripped variants not as expected')
Пример #2
0
 def write_variant_to_tempfile(self, variant):
   """Writes `variant` to a temporary VCF file and returns that file's path.

   The writer is configured with a single contig named '20', no filters, and
   one sample name per call in `variant.calls`.

   Args:
     variant: The variant to write; its calls supply the sample names.

   Returns:
     The path of the temporary file the variant was written to.
   """
   vcf_path = test_utils.test_tmpfile('test.vcf')
   contigs = [core_pb2.ContigInfo(name='20')]
   sample_names = [call.call_set_name for call in variant.calls]
   vcf_out = genomics_io.make_vcf_writer(
       outfile=vcf_path, contigs=contigs, samples=sample_names, filters=[])
   # The writer is used as a context manager around the single write.
   with vcf_out:
     vcf_out.write(variant)
   return vcf_path
Пример #3
0
 def _parse_read_with_aux_tags(self, tag_string):
   """Parses a one-read SAM file whose read line ends with `tag_string`.

   Args:
     tag_string: Text appended to the stock read line after its last tab.

   Returns:
     A list of everything the SAM reader yields for the temporary file, with
     aux-field parsing enabled.
   """
   # Smallest header that still makes the file a valid SAM file.
   sam_header = '@HD	VN:1.3	SO:coordinate\n@SQ	SN:chr1	LN:248956422\n'
   # One fixed read; the caller's AUX fields go on the end of it.
   sam_record = 'read_name	0	chr1	1	0	3M	*	0	0	CCC	AAA	' + tag_string
   sam_path = test_utils.test_tmpfile('aux_tags.bam')
   with tf.gfile.FastGFile(sam_path, 'w') as sam_out:
     sam_out.write(sam_header)
     sam_out.write(sam_record + '\n')
   sam_reader = genomics_io.make_sam_reader(
       sam_path, use_index=False, parse_aux_fields=True)
   with sam_reader as reader:
     return list(reader.iterate())
Пример #4
0
 def setUp(self):
   """Builds the output path, writer options, writer and variant fixtures.

   Sets self.out_fname, self.options, self.writer and self.variant. The
   variant is on 'Chr1' at start 10 with alleles A/C and carries one call for
   each of the two samples named in the writer options.
   """
   contigs = [
       core_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0),
       core_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1),
   ]
   self.out_fname = test_utils.test_tmpfile('output.vcf')
   self.options = core_pb2.VcfWriterOptions(
       contigs=contigs, sample_names=['Fido', 'Spot'], filters=[])
   self.writer = vcf_writer.VcfWriter.to_file(self.out_fname, self.options)

   variant = test_utils.make_variant(
       chrom='Chr1', start=10, alleles=['A', 'C'])
   # One call per sample, added in the same order as the sample names above.
   for sample, genotype in (('Fido', [0, 0]), ('Spot', [0, 1])):
     variant.calls.add(genotype=genotype, call_set_name=sample)
   self.variant = variant
Пример #5
0
  def test_make_read_writer_tfrecords(self):
    """Checks that make_read_writer writes reads to a '.tfrecord' path."""
    tfrecord_path = test_utils.test_tmpfile('test.tfrecord')
    read_writer = genomics_io.make_read_writer(outfile=tfrecord_path)

    # The writer must work as a context manager and accept reads.
    with read_writer:
      read_writer.write(self.read1)
      read_writer.write(self.read2)

    # The file should hold exactly the two reads written, in write order.
    expected_reads = [self.read1, self.read2]
    written_reads = list(
        io_utils.read_tfrecords(tfrecord_path, proto=reads_pb2.Read))
    self.assertEqual(expected_reads, written_reads)
Пример #6
0
  def test_writing_canned_variants(self):
    """Writes the first five canned variants and checks the exact VCF text."""

    # The canned variants are stored as Variant protos in a TFRecord file.
    canned_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')

    contigs = [
        core_pb2.ContigInfo(name='chr1', n_bases=248956422),
        core_pb2.ContigInfo(name='chr2', n_bases=242193529),
        core_pb2.ContigInfo(name='chr3', n_bases=198295559),
        core_pb2.ContigInfo(name='chrX', n_bases=156040895)
    ]
    filter_ids = [
        'LowQual',
        'VQSRTrancheINDEL95.00to96.00',
        'VQSRTrancheINDEL96.00to97.00',
        'VQSRTrancheINDEL97.00to99.00',
        'VQSRTrancheINDEL99.00to99.50',
        'VQSRTrancheINDEL99.50to99.90',
        'VQSRTrancheINDEL99.90to99.95',
        'VQSRTrancheINDEL99.95to100.00+',
        'VQSRTrancheINDEL99.95to100.00',
        'VQSRTrancheSNP99.50to99.60',
        'VQSRTrancheSNP99.60to99.80',
        'VQSRTrancheSNP99.80to99.90',
        'VQSRTrancheSNP99.90to99.95',
        'VQSRTrancheSNP99.95to100.00+',
        'VQSRTrancheSNP99.95to100.00',
    ]
    options = core_pb2.VcfWriterOptions(
        contigs=contigs,
        sample_names=['NA12878_18_99'],
        filters=[core_pb2.VcfFilterInfo(id=fid) for fid in filter_ids])

    canned_variants = list(
        io_utils.read_tfrecords(canned_path, proto=variants_pb2.Variant))
    vcf_path = test_utils.test_tmpfile('output.vcf')
    with vcf_writer.VcfWriter.to_file(vcf_path, options) as vcf_out:
      # Only the first five variants are written and checked.
      for variant in canned_variants[:5]:
        vcf_out.write(variant)

    # The expected file is spelled out line by line: header first, then one
    # line per written variant.
    # pylint: disable=line-too-long
    expected_header = [
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=PASS,Description="All filters passed">\n',
        '##FILTER=<ID=LowQual,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
        'passing filters reads.">\n',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
        'passing filters reads for each allele.">\n',
        '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
        'fractions.">\n',
        '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype '
        'likelihoods, log10 encoded">\n',
        '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
        'likelihoods, Phred encoded">\n',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
        'the interval">\n',
        '##contig=<ID=chr1,length=248956422>\n',
        '##contig=<ID=chr2,length=242193529>\n',
        '##contig=<ID=chr3,length=198295559>\n',
        '##contig=<ID=chrX,length=156040895>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
    ]
    expected_records = [
        'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
        'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
        'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
        'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
        'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
    ]
    # pylint: enable=line-too-long
    expected_vcf_content = expected_header + expected_records

    with tf.gfile.GFile(vcf_path, 'r') as vcf_in:
      written_lines = vcf_in.readlines()
    self.assertEqual(written_lines, expected_vcf_content)
Пример #7
0
 def write_test_protos(self, filename):
     """Writes ten ContigInfo protos to a temporary file.

     The protos are named '0' through '9' and are written with
     io.write_tfrecords.

     Args:
       filename: Name passed to test_utils.test_tmpfile to build the path.

     Returns:
       A (protos, path) tuple: the list of protos written and the file path.
     """
     contig_names = [str(index) for index in range(10)]
     protos = [core_pb2.ContigInfo(name=name) for name in contig_names]
     path = test_utils.test_tmpfile(filename)
     io.write_tfrecords(protos, path)
     return protos, path