Example #1
    def test_conversion_to_tfrecord_and_back(self, original_input_file):
        """Test conversion from a native file format to tfrecord.gz, then back."""
        input_path = test_utils.genomics_core_testdata(original_input_file)
        tfrecord_output_path = test_utils.test_tmpfile(original_input_file +
                                                       ".tfrecord.gz")
        native_output_path = test_utils.test_tmpfile(original_input_file)

        # Test conversion from native format to tfrecord.
        self._convert(input_path, tfrecord_output_path)

        # TODO(b/63133103): remove this when SAM writer is implemented.
        if native_output_path.endswith(".sam"):
            raise unittest.SkipTest("SAM writing not yet supported")

        # Test conversion from tfrecord format back to native format.  Ensure that
        # conversions where we would need a header, but don't have one from the
        # input, trigger an error message.
        if any(
                native_output_path.endswith(ext)
                for ext in FORMATS_REQUIRING_HEADER):
            with self.assertRaisesRegex(
                    converter.ConversionError,
                    "Input file does not have a header, which is needed to construct "
                    "output file"):
                self._convert(tfrecord_output_path, native_output_path)

        else:
            self._convert(tfrecord_output_path, native_output_path)
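
The conversions above, and every other example on this page, lean on the same helper: test_utils.test_tmpfile returns a path inside the test's temporary directory and, when given contents (as in Examples #14, #18, and #19), writes them for you. A minimal sketch of the two call forms, assuming the Nucleus import path and that the contents keyword behaves as those examples suggest:

from nucleus.testing import test_utils  # import path is an assumption

# Path only: the file will be created later by the code under test.
out_path = test_utils.test_tmpfile('roundtrip_output.vcf')

# Path plus contents: the helper writes the string for us, which is handy for
# small hand-built inputs such as the BED intervals in Example #14.
bed_path = test_utils.test_tmpfile(
    'tiny.bed', contents='chr1\t10\t20\nchr1\t30\t40\n')
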
Example #2
 def testGlobListShardedFilePatterns(self, specs, expected_files):
   # First, create all expected_files so Glob will work later.
   expected_full_files = [test_utils.test_tmpfile(f, '')
                          for f in expected_files]
   # Build the full spec names; this step does not create any files.
   full_specs = ','.join(
       [test_utils.test_tmpfile(spec) for spec in specs.split(',')])
   self.assertEqual(sorted(set(expected_full_files)),
                    io.glob_list_sharded_file_patterns(full_specs))
Example #3
 def setUp(self):
     super(TabixTest, self).setUp()
     self.input_file = test_utils.genomics_core_testdata(
         'test_samples.vcf.gz')
     self.output_file = test_utils.test_tmpfile('test_samples.vcf.gz')
     # Copy the compressed VCF into the temp dir so tabix can write its .tbi
     # index file alongside it.
     shutil.copyfile(self.input_file, self.output_file)
     self.tbx_index_file = self.output_file + '.tbi'
Example #4
  def test_round_trip_vcf(self, test_datum_name):
    # Round-trip variants through writing and reading:
    # 1. Read variants v1 from VcfReader;
    # 2. Write v1 to vcf using our VcfWriter;
    # 3. Read back in using VcfReader -- v2;
    # 4. Compare v1 and v2.
    in_file = test_utils.genomics_core_testdata(test_datum_name)
    out_file = test_utils.test_tmpfile('output_' + test_datum_name)

    v1_reader = vcf.VcfReader(in_file)
    v1_records = list(v1_reader.iterate())
    self.assertTrue(v1_records, 'Reader failed to find records')

    header = copy.deepcopy(v1_reader.header)
    writer_options = variants_pb2.VcfWriterOptions()

    with vcf_writer.VcfWriter.to_file(out_file, header,
                                      writer_options) as writer:
      for record in v1_records:
        writer.write(record)

    v2_reader = vcf.VcfReader(out_file)
    v2_records = list(v2_reader.iterate())

    self.assertEqual(v1_records, v2_records,
                     'Round-tripped variants not as expected')
Example #5
    def test_roundtrip(self,
                       expected_infos,
                       expected_fmt,
                       expected_fmt1,
                       expected_fmt2,
                       reader_excluded_info=None,
                       reader_excluded_format=None,
                       writer_excluded_info=None,
                       writer_excluded_format=None):
        expected_records = [
            record.format(info=info, fmt=expected_fmt, efmts1=e1,
                          efmts2=e2) for record, info, e1, e2 in zip(
                              self.record_format_strings, expected_infos,
                              expected_fmt1, expected_fmt2)
        ]
        expected = self.header + ''.join(expected_records)
        with vcf.VcfReader(
                test_utils.genomics_core_testdata('test_py_roundtrip.vcf'),
                excluded_info_fields=reader_excluded_info,
                excluded_format_fields=reader_excluded_format) as reader:

            records = list(reader.iterate())
            output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
            with vcf.VcfWriter(
                    output_path,
                    header=reader.header,
                    excluded_info_fields=writer_excluded_info,
                    excluded_format_fields=writer_excluded_format) as writer:
                for record in records:
                    writer.write(record)

        with open(output_path) as f:
            actual = f.read()
        self.assertEqual(actual, expected)
Example #6
 def setUp(self):
   self.out_fname = test_utils.test_tmpfile('output.vcf')
   self.header = variants_pb2.VcfHeader(
       contigs=[
           reference_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0),
           reference_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1),
       ],
       sample_names=['Fido', 'Spot'],
       formats=[
           variants_pb2.VcfFormatInfo(
               id='GT', number='1', type='String', description='Genotype'),
           variants_pb2.VcfFormatInfo(
               id='GQ',
               number='1',
               type='Float',
               description='Genotype Quality')
       ],
   )
   self.options = variants_pb2.VcfWriterOptions()
   self.writer = vcf_writer.VcfWriter.to_file(self.out_fname, self.header,
                                              self.options)
   self.variant = test_utils.make_variant(
       chrom='Chr1',
       start=10,
       alleles=['A', 'C'],
   )
   self.variant.calls.extend([
       variants_pb2.VariantCall(genotype=[0, 0], call_set_name='Fido'),
       variants_pb2.VariantCall(genotype=[0, 1], call_set_name='Spot'),
   ])
Example #7
    def test_main(self):
        examples_out = test_utils.test_tmpfile('output.tfrecord')
        ngs_errors.make_ngs_error_examples(
            ref_path=test_utils.genomics_core_testdata(
                'ucsc.hg19.chr20.unittest.fasta.gz'),
            vcf_path=test_utils.genomics_core_testdata(
                'test_nist.b37_chr20_100kbp_at_10mb.vcf.gz'),
            bam_path=test_utils.genomics_core_testdata(
                'NA12878_S1.chr20.10_10p1mb.bam'),
            examples_out_path=examples_out,
            max_reads=100)

        actual_examples = _read_examples(examples_out)
        golden_examples = _read_examples(
            test_utils.genomics_core_testdata(
                'golden.examples.ngs_errors.tfrecord'))

        self.assertEqual(len(actual_examples), 100)
        assertExamplesAreEqual(self,
                               golden_examples,
                               actual_examples,
                               expected_keys={
                                   'read_name', 'cigar', 'read_sequence',
                                   'read_qualities', 'true_sequence'
                               })
Example #8
 def setUp(self):
     writer_options = fastq_pb2.FastqWriterOptions()
     out_fname = test_utils.test_tmpfile('output.fastq')
     self.writer = fastq_writer.FastqWriter.to_file(out_fname,
                                                    writer_options)
     self.expected_fastq_content = [
         '@NODESC:header\n',
         'GATTACA\n',
         '+\n',
         'BB>B@FA\n',
         '@M01321:49:000000000-A6HWP:1:1101:17009:2216 1:N:0:1\n',
         'CGTTAGCGCAGGGGGCATCTTCACACTGGTGACAGGTAACCGCCGTAGTAAAGGTTCCGCCTTTCACT\n',
         '+\n',
         'AAAAABF@BBBDGGGG?FFGFGHBFBFBFABBBHGGGFHHCEFGGGGG?FGFFHEDG3EFGGGHEGHG\n',
         '@FASTQ contains multiple spaces in description\n',
         'CGGCTGGTCAGGCTGACATCGCCGCCGGCCTGCAGCGAGCCGCTGC\n',
         '+\n',
         'FAFAF;F/9;.:/;999B/9A.DFFF;-->.AAB/FC;9-@-=;=.\n',
         '@FASTQ_with_trailing_space\n',
         'CGG\n',
         '+\n',
         'FAD\n',
     ]
     self.record = fastq_pb2.FastqRecord(id='ID',
                                         description='desc',
                                         sequence='ACGTAC',
                                         quality='ABCDEF')
Example #9
 def test_roundtrip_writer(self, filename):
   output_path = test_utils.test_tmpfile(filename)
   original_reader = sam.SamReader(test_utils.genomics_core_testdata(filename))
   original_records = list(original_reader.iterate())
   with sam.SamWriter(output_path, header=original_reader.header) as writer:
     for record in original_records:
       writer.write(record)
   with sam.SamReader(output_path) as new_reader:
     self.assertEqual(original_records, list(new_reader.iterate()))
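
Several of these roundtrip tests (this one and Examples #11, #12, #24, and #29) take a filename argument; in the original suites such tests are driven by a parameterized decorator. A hedged sketch of the typical wiring, assuming absl's parameterized test framework and using placeholder data file names:

from absl.testing import parameterized  # test framework is an assumption


class SamRoundtripTest(parameterized.TestCase):

  # The concrete test data names below are placeholders, not from the source.
  @parameterized.parameters('test.sam', 'test.bam')
  def test_roundtrip_writer(self, filename):
    ...  # body as in Example #9: write the records back out and re-read them
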
Example #10
    def test_writing(self):
        path = test_utils.test_tmpfile('test_gfile')
        with gfile.Open(path, 'w') as f:
            f.write('test\n')
            f.write('end\n')

        with gfile.Open(path, 'r') as f2:
            lines = f2.readlines()

        self.assertEqual(['test\n', 'end\n'], lines)
Example #11
    def test_roundtrip_writer(self, filename):
        output_path = test_utils.test_tmpfile(filename)
        with fastq.FastqWriter(output_path) as writer:
            for record in self.records:
                writer.write(record)

        with fastq.FastqReader(output_path) as reader:
            v2_records = list(reader.iterate())

        self.assertEqual(self.records, v2_records)
Example #12
    def test_roundtrip_writer(self, filename):
        output_path = test_utils.test_tmpfile(filename)
        with bed.BedWriter(output_path,
                           header=bed_pb2.BedHeader(num_fields=5)) as writer:
            for record in self.records:
                writer.write(record)

        with bed.BedReader(output_path) as reader:
            v2_records = list(reader.iterate())

        self.assertEqual(self.records, v2_records)
Example #13
 def _parse_read_with_aux_tags(self, tag_string):
     # Minimal header line to create a valid SAM file.
     header_lines = '@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\n'
     # A single stock read we'll add our AUX fields to.
     read = 'read_name\t0\tchr1\t1\t0\t3M\t*\t0\t0\tCCC\tAAA\t' + tag_string
     path = test_utils.test_tmpfile('aux_tags.bam')
     with gfile.GFile(path, 'w') as fout:
         fout.write(header_lines)
         fout.write(read + '\n')
     with sam.SamReader(path, parse_aux_fields=True) as reader:
         return list(reader.iterate())
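
A caller of the helper above would then assert that the AUX fields ended up on the parsed reads. A minimal, hypothetical usage sketch; the tag string and the Read proto's info accessors are assumptions, not taken from the source:

# Inside a test method of the same test case (hypothetical usage).
reads = self._parse_read_with_aux_tags('ZJ:Z:my_string\tZB:i:42')
self.assertLen(reads, 1)
# Both AUX tags should have been parsed into the read's info map.
self.assertIn('ZJ', reads[0].info)
self.assertIn('ZB', reads[0].info)
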
Example #14
 def test_bed_parser(self):
     test_bed_path = test_utils.test_tmpfile(
         'test_bed_parser.bed', '\n'.join([
             'chr20\t61724611\t61725646', 'chr20\t61304163\t61305182',
             'chr20\t61286467\t61286789'
         ]))
     self.assertEqual(list(ranges.bed_parser(test_bed_path)), [
         ranges.make_range('chr20', 61724611, 61725646),
         ranges.make_range('chr20', 61304163, 61305182),
         ranges.make_range('chr20', 61286467, 61286789),
     ])
Example #15
 def setUp(self):
   out_fname = test_utils.test_tmpfile('output.bed')
   self.writer = bed_writer.BedWriter.to_file(
       out_fname, bed_pb2.BedHeader(num_fields=12), bed_pb2.BedWriterOptions())
   self.expected_bed_content = [
       'chr1\t10\t20\tfirst\t100\t+\t12\t18\t255,124,1\t3\t2,6,2\t10,12,18\n',
       'chr1\t100\t200\tsecond\t250\t.\t120\t180\t252,122,12\t2\t35,40\t'
       '100,160\n'
   ]
   self.record = bed_pb2.BedRecord(
       reference_name='chr1', start=20, end=30, name='r')
Example #16
    def test_main(self):
        in_fname = test_utils.genomics_core_testdata('test_vaf.vcf')
        out_fname = test_utils.test_tmpfile('output.vcf')
        filter_vcf.main(['filter_vcf', in_fname, out_fname])

        with vcf.VcfReader(out_fname) as reader:
            variants = list(reader)
            self.assertEqual(3, len(variants))
            self.assertEqual(['DogSNP4', 'DogSNP5', 'DogSNP6'],
                             [v.names[0] for v in variants])
            for v in variants:
                self.assertGreater(v.quality, 3.01)
Example #17
 def setUp(self):
     out_fname = test_utils.test_tmpfile('output.gff')
     self.writer = gff_writer.GffWriter.to_file(out_fname,
                                                gff_pb2.GffHeader(),
                                                gff_pb2.GffWriterOptions())
     self.expected_gff_content = open(
         test_utils.genomics_core_testdata(
             'test_features.gff')).readlines()
     self.header = gff_pb2.GffHeader(
         sequence_regions=[ranges.make_range('ctg123', 0, 1497228)])
     self.record = gff_pb2.GffRecord(
         range=ranges.make_range('ctg123', 1000, 1100))
Example #18
  def test_roundtrip_num_fields(self, num_fields):
    all_num_fields_in_file = [
        n for n in _VALID_NUM_BED_FIELDS if n >= num_fields
    ]
    for num_fields_in_file in all_num_fields_in_file:
      lines = ['\t'.join(line[:num_fields_in_file]) for line in self.tokens]
      contents = '{}\n'.format('\n'.join(lines))
      input_path = test_utils.test_tmpfile('test_field.bed', contents=contents)

      with bed.BedReader(input_path, num_fields=num_fields) as reader:
        records = list(reader.iterate())
      output_path = test_utils.test_tmpfile('test_field2.bed')
      with bed.BedWriter(output_path, header=reader.header) as writer:
        for record in records:
          writer.write(record)

      with bed.BedReader(output_path) as reader2:
        v2_records = list(reader2.iterate())

      self.assertLen(records, 3)
      self.assertEqual(records, v2_records)
Example #19
 def test_bedpe_parser_skips_cross_chr_events(self):
     # pylint: disable=line-too-long
     # The first record spans chr20 -> chr21, so the parser should skip it.
     data = '\n'.join([
         'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
         'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
         'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
     ])
     test_bedpe_path = test_utils.test_tmpfile('test_bedpe_parser2.bedpe',
                                               data)
     self.assertEqual(list(ranges.bedpe_parser(test_bedpe_path)), [
         ranges.make_range('chr20', 25972820, 26045538),
         ranges.make_range('chr20', 23719873, 23796523),
     ])
Example #20
  def test_headerless_vcf(self):
    """Writes a headerless vcf and reads it back out."""
    test_vcf = test_utils.genomics_core_testdata('test_sites.vcf')
    output_vcf = test_utils.test_tmpfile('output.vcf')
    expected_variants = []
    with vcf.VcfReader(test_vcf) as reader:
      with vcf.VcfWriter(
          output_vcf, header=reader.header, exclude_header=True) as writer:
        for record in reader:
          expected_variants.append(record)
          writer.write(record)

      with vcf.VcfReader(output_vcf, header=reader.header) as actual_reader:
        self.assertEqual(expected_variants, list(actual_reader))
Example #21
    def test_get_input_fn(self):
        test_file = test_utils.test_tmpfile('test.tfrecord')

        # Use a simple test example that consists of a read sequence of length 5.
        example = example_pb2.Example()
        read_sequence = 'ACGTA'
        true_sequence = 'ACCTA'
        aligned_qualities = [30, 30, 20, 30, 30]
        features = example.features
        features.feature['read_name'].bytes_list.value.append(
            six.b('test_seq'))
        features.feature['read_sequence'].int64_list.value.extend(
            ['ACGT'.index(b) for b in read_sequence])
        features.feature['read_qualities'].int64_list.value.extend(
            aligned_qualities)
        features.feature['true_sequence'].int64_list.value.extend(
            ['ACGT'.index(b) for b in true_sequence])
        features.feature['ref_match'].int64_list.value.extend([1, 1, 0, 1, 1])

        with genomics_writer.TFRecordWriter(test_file) as writer:
            writer.write(example)

        features, label = ngs_errors.get_input_fn(
            test_file,
            ngs_read_length=len(read_sequence),
            batch_size=1,
            num_epochs=1)()
        with tf.Session() as sess:
            features_val, label_val = sess.run([features, label])
            features_array = np.array(features_val)
            self.assertEqual((1, 4, 5, 3), features_array.shape)
            self.assertTrue(
                np.array_equal(
                    np.array([[[1, 0, 0, 0, 1], [0, 1, 0, 0,
                                                 0], [0, 0, 1, 0, 0],
                               [0, 0, 0, 1, 0]]]), features_array[:, :, :, 0]))
            self.assertTrue(
                np.array_equal(
                    np.array([[[1, 1, 0, 1, 1], [1, 1, 0, 1,
                                                 1], [1, 1, 0, 1, 1],
                               [1, 1, 0, 1, 1]]]), features_array[:, :, :, 1]))
            self.assertTrue(
                np.array_equal(
                    np.array([[[30, 30, 20, 30, 30], [30, 30, 20, 30, 30],
                               [30, 30, 20, 30, 30], [30, 30, 20, 30, 30]]]),
                    features_array[:, :, :, 2]))
            self.assertTrue(
                np.array_equal(np.array([[0, 1, 1, 3, 0]]),
                               np.array(label_val)))
Example #22
    def test_make_read_writer_tfrecords(self):
        outfile = test_utils.test_tmpfile('test.tfrecord')
        writer = sam.SamWriter(outfile, header=self.header)

        # Test that the writer is a context manager and that we can write a read to
        # it.
        with writer:
            writer.write(self.read1)
            writer.write(self.read2)

        # Our output should contain exactly the two reads we wrote, in order.
        self.assertEqual([self.read1, self.read2],
                         list(
                             tfrecord.read_tfrecords(outfile,
                                                     proto=reads_pb2.Read)))
Example #23
    def test_writing_canned_records(self):
        """Tests writing all the records that are 'canned' in our tfrecord file."""
        # This file is in TFRecord format.
        tfrecord_file = test_utils.genomics_core_testdata(
            'test_features.gff.tfrecord')
        writer_options = gff_pb2.GffWriterOptions()
        gff_records = list(
            io_utils.read_tfrecords(tfrecord_file, proto=gff_pb2.GffRecord))
        out_fname = test_utils.test_tmpfile('output.gff')
        with gff_writer.GffWriter.to_file(out_fname, self.header,
                                          writer_options) as writer:
            for record in gff_records:
                writer.write(record)

        with open(out_fname) as f:
            self.assertEqual(f.readlines(), self.expected_gff_content)
Example #24
  def test_roundtrip_writer(self, bedgraph_path):
    output_path = test_utils.test_tmpfile(bedgraph_path)
    input_path = test_utils.genomics_core_testdata(bedgraph_path)
    records = []
    with bedgraph.BedGraphReader(input_path) as reader:
      records = list(reader.iterate())

    with bedgraph.BedGraphWriter(output_path) as writer:
      for record in records:
        writer.write(record)

    with bedgraph.BedGraphReader(output_path) as reader:
      v2_records = list(reader.iterate())

    self.assertLen(records, 4)
    self.assertEqual(records, v2_records)
Example #25
    def test_writing_canned_records(self):
        """Tests writing all the variants that are 'canned' in our tfrecord file."""
        # This file is in TFRecord format.
        tfrecord_file = test_utils.genomics_core_testdata(
            'test_reads.fastq.tfrecord')

        writer_options = fastq_pb2.FastqWriterOptions()
        fastq_records = list(
            tfrecord.read_tfrecords(tfrecord_file,
                                    proto=fastq_pb2.FastqRecord))
        out_fname = test_utils.test_tmpfile('output.fastq')
        with fastq_writer.FastqWriter.to_file(out_fname,
                                              writer_options) as writer:
            for record in fastq_records:
                writer.write(record)

        with gfile.GFile(out_fname, 'r') as f:
            self.assertEqual(f.readlines(), self.expected_fastq_content)
Example #26
    def test_writing_canned_records(self):
        """Tests writing all the records that are 'canned' in our tfrecord file."""
        # This file is in TFRecord format.
        tfrecord_file = test_utils.genomics_core_testdata(
            'test_regions.bed.tfrecord')

        header = bed_pb2.BedHeader(num_fields=12)
        writer_options = bed_pb2.BedWriterOptions()
        bed_records = list(
            tfrecord.read_tfrecords(tfrecord_file, proto=bed_pb2.BedRecord))
        out_fname = test_utils.test_tmpfile('output.bed')
        with bed_writer.BedWriter.to_file(out_fname, header,
                                          writer_options) as writer:
            for record in bed_records:
                writer.write(record)

        with gfile.Open(out_fname, 'r') as f:
            self.assertEqual(f.readlines(), self.expected_bed_content)
Example #27
 def write_variant_to_tempfile(self, variant):
   output_path = test_utils.test_tmpfile('test.vcf')
   header = variants_pb2.VcfHeader(
       contigs=[reference_pb2.ContigInfo(name='20')],
       sample_names=[call.call_set_name for call in variant.calls],
       formats=[
           variants_pb2.VcfFormatInfo(
               id='DP', number='1', type='Integer', description='Read depth'),
           variants_pb2.VcfFormatInfo(
               id='AD',
               number='R',
               type='Integer',
               description='Read depth for each allele')
       ])
   writer = vcf.VcfWriter(output_path, header=header)
   with writer:
     writer.write(variant)
   return output_path
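
A test using this helper would typically write one variant and read the file back to check what was emitted. A small, hypothetical usage sketch; the make_variant arguments and the assertions are illustrative only:

# Hypothetical caller, inside the same test case.
variant = test_utils.make_variant(chrom='20', start=10, alleles=['A', 'C'])
path = self.write_variant_to_tempfile(variant)
with vcf.VcfReader(path) as reader:
  written = list(reader)
self.assertLen(written, 1)
self.assertEqual('20', written[0].reference_name)
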
Example #28
  def test_main(self):
    in_fname = test_utils.genomics_core_testdata('test_allele_depth.vcf')
    out_fname = test_utils.test_tmpfile('output.vcf')
    add_ad_to_vcf.main(['add_ad_to_vcf', in_fname, out_fname])

    with vcf.VcfReader(out_fname, use_index=False) as reader:
      info_ids = [info.id for info in reader.header.infos]
      self.assertTrue('AD' in info_ids)
      variant1 = next(reader)
      self.assertEqual([3, 3], variant_utils.get_info(variant1, 'AD', reader))
      variant2 = next(reader)
      self.assertEqual([30, 44], variant_utils.get_info(variant2, 'AD', reader))
      variant3 = next(reader)
      self.assertEqual([15, 4], variant_utils.get_info(variant3, 'AD', reader))
      variant4 = next(reader)
      self.assertEqual([2, 4], variant_utils.get_info(variant4, 'AD', reader))
      variant5 = next(reader)
      self.assertEqual([24, 2], variant_utils.get_info(variant5, 'AD', reader))
Example #29
 def test_roundtrip_cram_writer(self, filename, has_embedded_ref):
   output_path = test_utils.test_tmpfile(filename)
   writer_ref_path = test_utils.genomics_core_testdata('test.fasta')
   # When the CRAM embeds its own reference, the reader needs no external
   # FASTA; otherwise point it at the same reference used for writing.
   reader_ref_path = ''
   if not has_embedded_ref:
     reader_ref_path = writer_ref_path
   original_reader = sam.SamReader(
       test_utils.genomics_core_testdata(filename), ref_path=reader_ref_path)
   original_records = list(original_reader.iterate())
   with sam.SamWriter(
       output_path,
       header=original_reader.header,
       ref_path=writer_ref_path,
       embed_ref=has_embedded_ref) as writer:
     for record in original_records:
       writer.write(record)
   with sam.SamReader(output_path, ref_path=reader_ref_path) as new_reader:
     self.assertEqual(original_records, list(new_reader.iterate()))
Example #30
    def test_round_trip_fastq(self, test_datum_name):
        # Round-trip FASTQ records through writing and reading:
        # 1. Read records v1 from FastqReader;
        # 2. Write v1 to fastq using our FastqWriter;
        # 3. Read back in using FastqReader -- v2;
        # 4. Compare v1 and v2.
        in_file = test_utils.genomics_core_testdata(test_datum_name)
        out_file = test_utils.test_tmpfile('output_' + test_datum_name)

        v1_reader = fastq.FastqReader(in_file)
        v1_records = list(v1_reader.iterate())
        self.assertTrue(v1_records, 'Reader failed to find records')

        writer_options = fastq_pb2.FastqWriterOptions()

        with fastq_writer.FastqWriter.to_file(out_file,
                                              writer_options) as writer:
            for record in v1_records:
                writer.write(record)

        v2_reader = fastq.FastqReader(out_file)
        v2_records = list(v2_reader.iterate())
        self.assertEqual(v1_records, v2_records,
                         'Round-tripped FASTQ files not as expected')