Пример #1
0
    def test_use_of_representative_header(self):
        # The file header declares INFO field `HU` as Float although the record
        # holds string values for it. Reading with the file's own header is
        # expected to raise ValueError; reading with a representative header
        # that declares `HU` as String is expected to yield the variant below.
        file_content = [
            '##INFO=<ID=HU,Number=.,Type=Float,Description="Info">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
            '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample1	Sample2\r\n',
            '19	2	.	A	T	.	.	HU=a,b	GT	0/0	0/1\n',
        ]
        representative_header_lines = [
            '##INFO=<ID=HU,Number=.,Type=String,Description="Info">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        ]
        variant = Variant(reference_name='19',
                          start=1,
                          end=2,
                          reference_bases='A',
                          alternate_bases=['T'],
                          info={'HU': ['a', 'b']})
        variant.calls.append(VariantCall(name='Sample1', genotype=[0, 0]))
        variant.calls.append(VariantCall(name='Sample2', genotype=[0, 1]))

        # `file_headers` is used. The return value is deliberately not
        # captured: the call is expected to raise before returning.
        with self.assertRaises(ValueError):
            self._create_temp_file_and_read_records(file_content)

        # `representative_header` is used.
        read_data = self._create_temp_file_and_read_records(
            file_content, representative_header_lines)
        self.assertEqual(1, len(read_data))
        self._assert_variants_equal([variant], read_data)
Пример #2
0
 def test_custom_phaseset(self):
     # Reads two records carrying a per-sample `PS` FORMAT field and checks
     # the phaseset value attached to each expected call.
     ps_header = (
         '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phaseset">\n')
     records = [
         '19	123	.	A	T	.	.	.	GT:PS	1|0:1111	0/1:.\n',
         '19	121	.	A	T	.	.	.	GT:PS	1|0:2222	0/1:2222\n'
     ]

     first = Variant(reference_name='19', start=122, end=123,
                     reference_bases='A', alternate_bases=['T'])
     first.calls.append(
         VariantCall(name='Sample1', genotype=[1, 0], phaseset='1111'))
     # Sample2 has '.' for PS in the first record, so no phaseset is passed.
     first.calls.append(VariantCall(name='Sample2', genotype=[0, 1]))

     second = Variant(reference_name='19', start=120, end=121,
                      reference_bases='A', alternate_bases=['T'])
     # Both samples share phaseset '2222' in the second record.
     for sample_name, genotype in (('Sample1', [1, 0]), ('Sample2', [0, 1])):
         second.calls.append(
             VariantCall(name=sample_name, genotype=genotype,
                         phaseset='2222'))

     vcf_lines = [ps_header] + _SAMPLE_HEADER_LINES[1:] + records
     parsed = self._create_temp_file_and_read_records(vcf_lines)
     self.assertEqual(2, len(parsed))
     self._assert_variants_equal([first, second], parsed)
Пример #3
0
  def test_end_info_key_unknown_number_invalid(self):
    # Exercises an `END` INFO field declared with Number=. for three inputs:
    # a multi-valued END, a fractional END and a non-numeric END.
    end_header = (
        '##INFO=<ID=END,Number=.,Type=Integer,Description="End of record.">\n')
    header_lines = [end_header] + _SAMPLE_HEADER_LINES[1:]

    expected = Variant(
        reference_name='19', start=122, end=150, reference_bases='A',
        alternate_bases=['T'])
    for sample_name, genotype in (('Sample1', [1, 0]), ('Sample2', [0, 1])):
      expected.calls.append(VariantCall(sample_id=hash_name(sample_name),
                                        name=sample_name,
                                        genotype=genotype))

    # PySam should only take first END field.
    parsed = self._create_temp_file_and_read_records(
        header_lines + ['19	123	.	A	T	.	.	END=150,160	GT	1/0	0/1\n'])
    self.assertEqual(1, len(parsed))
    self._assert_variants_equal([expected], parsed)

    # END should be rounded down.
    parsed = self._create_temp_file_and_read_records(
        header_lines + ['19	123	.	A	T	.	.	END=150.9	GT	1/0	0/1\n'])
    self.assertEqual(1, len(parsed))
    self._assert_variants_equal([expected], parsed)

    # END should not be a string.
    with self.assertRaises(ValueError):
      self._create_temp_file_and_read_records(
          header_lines + ['19	123	.	A	T	.	.	END=text	GT	1/0	0/1\n'])
Пример #4
0
  def test_format_numbers(self):
    # Covers FORMAT fields declared with Number=., 1, 2, A and G, and compares
    # the per-call `info` values against the single parsed record.
    headers = [
        '##FORMAT=<ID=FU,Number=.,Type=String,Description="Format_variable">\n',
        '##FORMAT=<ID=F1,Number=1,Type=Integer,Description="Format_1">\n',
        '##FORMAT=<ID=F2,Number=2,Type=Character,Description="Format_2">\n',
        '##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Format_3">\n',
        '##FORMAT=<ID=AD,Number=G,Type=Integer,Description="Format_4">\n',
    ]
    records = [
        ('19	2	.	A	T,C	.	.	.	'
         'GT:FU:F1:F2:AO:AD	1/0:a1:3:a,b:1:3,4	'
         '0/1:a2,a3:4:b,c:1,2:3'),
    ]

    # Expected per-sample FORMAT values; F1 (Number=1) is a scalar, the rest
    # are lists.
    sample1_info = {
        'FU': ['a1'], 'F1': 3, 'F2': ['a', 'b'], 'AO': [1], 'AD': [3, 4]}
    sample2_info = {
        'FU': ['a2', 'a3'], 'F1': 4, 'F2': ['b', 'c'], 'AO': [1, 2],
        'AD': [3]}

    expected = Variant(
        reference_name='19', start=1, end=2, reference_bases='A',
        alternate_bases=['T', 'C'])
    expected.calls.append(
        VariantCall(sample_id=hash_name('Sample1'), name='Sample1',
                    genotype=[1, 0], info=sample1_info))
    expected.calls.append(
        VariantCall(sample_id=hash_name('Sample2'), name='Sample2',
                    genotype=[0, 1], info=sample2_info))

    vcf_lines = headers + _SAMPLE_HEADER_LINES[1:] + records
    parsed = self._create_temp_file_and_read_records(vcf_lines)
    self.assertEqual(1, len(parsed))
    self.assertEqual(expected, parsed[0])
Пример #5
0
 def test_info_numbers_and_types(self):
   # Covers INFO fields declared with Number=A, G, R, 0 (Flag) and '.', with
   # String, Integer, Character, Flag and Float types, across two records.
   headers = [
       '##INFO=<ID=HA,Number=A,Type=String,Description="StringInfo_A">\n',
       '##INFO=<ID=HG,Number=G,Type=Integer,Description="IntInfo_G">\n',
       '##INFO=<ID=HR,Number=R,Type=Character,Description="ChrInfo_R">\n',
       '##INFO=<ID=HF,Number=0,Type=Flag,Description="FlagInfo">\n',
       '##INFO=<ID=HU,Number=.,Type=Float,Description="FloatInfo_variable">\n',
   ]
   records = [
       '19	2	.	A	T,C	.	.	HA=a1,a2;HG=1,2,3;HR=a,b,c;HF;HU=0.1	GT	1/0	0/1\n',
       '19	124	.	A	T	.	.	HG=3,4,5;HR=d,e;HU=1.1,1.2	GT	0/0	0/1',
   ]

   # First record: every INFO key is present; the flag HF is expected as True.
   first = Variant(
       reference_name='19', start=1, end=2, reference_bases='A',
       alternate_bases=['T', 'C'],
       info={'HA': ['a1', 'a2'], 'HG': [1, 2, 3], 'HR': ['a', 'b', 'c'],
             'HF': True, 'HU': [0.1]})
   for sample_name, genotype in (('Sample1', [1, 0]), ('Sample2', [0, 1])):
     first.calls.append(VariantCall(sample_id=hash_name(sample_name),
                                    name=sample_name,
                                    genotype=genotype))

   # Second record: HA and HF are absent from the INFO column.
   second = Variant(
       reference_name='19', start=123, end=124, reference_bases='A',
       alternate_bases=['T'],
       info={'HG': [3, 4, 5], 'HR': ['d', 'e'], 'HU': [1.1, 1.2]})
   for sample_name, genotype in (('Sample1', [0, 0]), ('Sample2', [0, 1])):
     second.calls.append(VariantCall(sample_id=hash_name(sample_name),
                                     name=sample_name,
                                     genotype=genotype))

   vcf_lines = headers + _SAMPLE_HEADER_LINES[1:] + records
   parsed = self._create_temp_file_and_read_records(vcf_lines)
   self.assertEqual(2, len(parsed))
   self._assert_variants_equal([first, second], parsed)
Пример #6
0
 def test_end_info_key(self):
   # One record supplies END=1111 and one has no INFO; the expected `end` is
   # 1111 for the former and 123 (the POS value) for the latter.
   end_header = (
       '##INFO=<ID=END,Number=1,Type=Integer,Description="End of record.">\n')
   records = [
       '19	123	.	A	T	.	.	END=1111	GT	1/0	0/1\n',
       '19	123	.	A	T	.	.	.	GT	0/1	1/1\n',
   ]

   with_end = Variant(
       reference_name='19', start=122, end=1111, reference_bases='A',
       alternate_bases=['T'])
   for sample_name, genotype in (('Sample1', [1, 0]), ('Sample2', [0, 1])):
     with_end.calls.append(VariantCall(sample_id=hash_name(sample_name),
                                       name=sample_name,
                                       genotype=genotype))

   without_end = Variant(
       reference_name='19', start=122, end=123, reference_bases='A',
       alternate_bases=['T'])
   for sample_name, genotype in (('Sample1', [0, 1]), ('Sample2', [1, 1])):
     without_end.calls.append(VariantCall(sample_id=hash_name(sample_name),
                                          name=sample_name,
                                          genotype=genotype))

   vcf_lines = [end_header] + _SAMPLE_HEADER_LINES[1:] + records
   parsed = self._create_temp_file_and_read_records(vcf_lines)
   self.assertEqual(2, len(parsed))
   self._assert_variants_equal([with_end, without_end], parsed)
  def test_missing_info_key(self):
    # Sample2 lacks the `GQ` key that Sample1 has; the expected line carries
    # '.' in Sample2's GQ position.
    coder = self._get_coder()
    record = Variant()
    per_sample_info = (
        ('Sample1', {'GQ': 10, 'AF': 20}),
        ('Sample2', {'AF': 20}),
    )
    for sample_name, call_info in per_sample_info:
      record.calls.append(
          VariantCall(name=sample_name, genotype=[0, 1], info=call_info))

    expected_line = ('.	.	.	.	.	.	.	.	GT:AF:GQ	0/1:20:10	'
                     '0/1:20:.\n')
    encoded = coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
 def test_no_info(self):
   # A record whose REF/ALT/INFO columns and sample columns are all '.';
   # each expected call carries a single missing-genotype value.
   line = 'chr19	123	.	.	.	.	.	.	GT	.	.'
   expected = Variant(reference_name='chr19', start=122, end=123)
   for sample_name in ('Sample1', 'Sample2'):
     expected.calls.append(
         VariantCall(name=sample_name,
                     genotype=[vcfio.MISSING_GENOTYPE_VALUE]))

   parsed = self._create_temp_file_and_read_records(
       _SAMPLE_HEADER_LINES + [line])
   self.assertEqual(1, len(parsed))
   self.assertEqual(expected, parsed[0])
 def test_end_info_key_unknown_number(self):
   # `END` is declared with Number=. and the record supplies a single value;
   # the expected variant uses it as `end` and has no alternate bases.
   end_header = (
       '##INFO=<ID=END,Number=.,Type=Integer,Description="End of record.">\n')
   records = ['19	123	.	A	.	.	.	END=1111	GT	1/0	0/1\n']

   expected = Variant(
       reference_name='19', start=122, end=1111, reference_bases='A')
   for sample_name, genotype in (('Sample1', [1, 0]), ('Sample2', [0, 1])):
     expected.calls.append(VariantCall(name=sample_name, genotype=genotype))

   vcf_lines = [end_header] + _SAMPLE_HEADER_LINES[1:] + records
   parsed = self._create_temp_file_and_read_records(vcf_lines)
   self.assertEqual(1, len(parsed))
   self._assert_variants_equal([expected], parsed)
Пример #10
0
    def test_use_of_representative_header_two_files(self):
        # Both files declare INFO field `HU` as Float (in a truncated header
        # line) while their records carry string values. Each file is read
        # with a representative header that declares `HU` as String, and the
        # parsed record is compared with the expected variant.
        representative_header_lines = [
            '##INFO=<ID=HU,Number=.,Type=String,Description="Info">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        ]

        # First file: single sample `Sample1`, last line has no newline.
        first_file = [
            '##INFO=<ID=HU,Number=.,Type=Float,Descri\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample1\r\n',
            '9\t2\t.\tA\tT\t.\t.\tHU=a,b\tGT\t0/0'
        ]
        first_expected = Variant(reference_name='9', start=1, end=2,
                                 reference_bases='A', alternate_bases=['T'],
                                 info={'HU': ['a', 'b']})
        first_expected.calls.append(
            VariantCall(sample_id=hash_name('Sample1'), genotype=[0, 0]))

        # Second file: single sample `Sample2`.
        second_file = [
            '##INFO=<ID=HU,Number=.,Type=Float,Descri\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample2\r\n',
            '19\t2\t.\tA\tT\t.\t.\tHU=a,b\tGT\t0/1\n',
        ]
        second_expected = Variant(reference_name='19', start=1, end=2,
                                  reference_bases='A', alternate_bases=['T'],
                                  info={'HU': ['a', 'b']})
        second_expected.calls.append(
            VariantCall(sample_id=hash_name('Sample2'), genotype=[0, 1]))

        first_parsed = self._create_temp_file_and_read_records(
            first_file, representative_header_lines)
        self.assertEqual(1, len(first_parsed))
        self._assert_variants_equal([first_expected], first_parsed)

        second_parsed = self._create_temp_file_and_read_records(
            second_file, representative_header_lines)
        self.assertEqual(1, len(second_parsed))
        self._assert_variants_equal([second_expected], second_parsed)
Пример #11
0
    def test_triploid_genotype(self):
        # A call with three genotype values is expected to be encoded as
        # three '/'-separated alleles.
        coder = self._get_coder()
        record = Variant()
        triploid_call = VariantCall(name='Sample', genotype=[1, 0, 1])
        record.calls.append(triploid_call)

        expected_line = '.	.	.	.	.	.	.	.	GT	1/0/1\n'
        encoded = coder.encode(record)
        self._assert_variant_lines_equal(encoded, expected_line)
Пример #12
0
 def test_empty_sample_calls(self):
   # A call whose genotype is the scalar -1 is expected to be encoded as '.'.
   coder = self._get_coder()
   record = Variant()
   empty_call = VariantCall(name='Sample2', genotype=-1)
   record.calls.append(empty_call)

   expected_line = '.	.	.	.	.	.	.	.	GT	.\n'
   encoded = coder.encode(record)
   self._assert_variant_lines_equal(encoded, expected_line)
  def test_missing_genotype(self):
    # A genotype containing vcfio.MISSING_GENOTYPE_VALUE is expected to be
    # encoded with '.' in that allele position.
    coder = self._get_coder()
    record = Variant()
    partial_call = VariantCall(
        name='Sample', genotype=[1, vcfio.MISSING_GENOTYPE_VALUE])
    record.calls.append(partial_call)

    expected_line = '.	.	.	.	.	.	.	.	GT	1/.\n'
    encoded = coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
  def test_info_list(self):
    # A list-valued call info entry is expected to be comma-joined, with a
    # None element encoded as '.'.
    coder = self._get_coder()
    record = Variant()
    list_call = VariantCall(
        name='Sample', genotype=[0, 1], info={'LI': [1, None, 3]})
    record.calls.append(list_call)

    expected_line = '.	.	.	.	.	.	.	.	GT:LI	0/1:1,.,3\n'
    encoded = coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
Пример #15
0
    def test_variant_equality(self):
        # Two variants built from the same values must compare equal; a
        # variant with a different call, or with no calls passed, must not.

        def make_variant(**extra):
            # Fresh argument objects are created on every call so that no
            # list or dict is shared between the variants under comparison.
            fields = dict(reference_name='a',
                          start=20,
                          end=22,
                          reference_bases='a',
                          alternate_bases=['g', 't'],
                          names=['variant'],
                          quality=9,
                          filters=['q10'],
                          info={'key': 'value'})
            fields.update(extra)
            return Variant(**fields)

        base_variant = make_variant(calls=[VariantCall(genotype=[0, 0])])
        equal_variant = make_variant(calls=[VariantCall(genotype=[0, 0])])
        different_calls = make_variant(calls=[VariantCall(genotype=[1, 0])])
        # `calls` is not passed at all for this one.
        missing_field = make_variant()

        self.assertEqual(base_variant, equal_variant)
        self.assertNotEqual(base_variant, different_calls)
        self.assertNotEqual(base_variant, missing_field)