Exemplo n.º 1
0
 def test_custom_phaseset(self):
     """Check that explicit PS format values end up as call phasesets.

     Two records are read together with a PS FORMAT header line. Each
     expected call carries the PS string found in its record column; the one
     call whose PS value is '.' is expected to equal a call built without a
     phaseset argument.
     """
     ps_header = (
         '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phaseset">\n')
     vcf_records = [
         '19	123	.	A	T	.	.	.	GT:PS	1|0:1111	0/1:.\n',
         '19	121	.	A	T	.	.	.	GT:PS	1|0:2222	0/1:2222\n'
     ]
     # The first sample header line is replaced by the PS header above.
     file_lines = [ps_header] + _SAMPLE_HEADER_LINES[1:] + vcf_records

     # Expected variant for the record at POS 123.
     first_expected = Variant(
         reference_name='19', start=122, end=123,
         reference_bases='A', alternate_bases=['T'])
     for expected_call in (
             VariantCall(name='Sample1', genotype=[1, 0], phaseset='1111'),
             VariantCall(name='Sample2', genotype=[0, 1])):
         first_expected.calls.append(expected_call)

     # Expected variant for the record at POS 121.
     second_expected = Variant(
         reference_name='19', start=120, end=121,
         reference_bases='A', alternate_bases=['T'])
     for expected_call in (
             VariantCall(name='Sample1', genotype=[1, 0], phaseset='2222'),
             VariantCall(name='Sample2', genotype=[0, 1], phaseset='2222')):
         second_expected.calls.append(expected_call)

     parsed = self._create_temp_file_and_read_records(file_lines)
     self.assertEqual(2, len(parsed))
     self._assert_variants_equal([first_expected, second_expected], parsed)
Exemplo n.º 2
0
 def test_info_numbers_and_types(self):
   """Check INFO parsing across several Number and Type declarations.

   The header declares INFO fields with Number=A, G, R, 0 and '.', using the
   String, Integer, Character, Flag and Float types. Two records are read
   and compared against hand-built variants whose info values carry the
   matching data and field_count.
   """
   header_lines = [
       '##INFO=<ID=HA,Number=A,Type=String,Description="StringInfo_A">\n',
       '##INFO=<ID=HG,Number=G,Type=Integer,Description="IntInfo_G">\n',
       '##INFO=<ID=HR,Number=R,Type=Character,Description="ChrInfo_R">\n',
       '##INFO=<ID=HF,Number=0,Type=Flag,Description="FlagInfo">\n',
       '##INFO=<ID=HU,Number=.,Type=Float,Description="FloatInfo_variable">\n']
   vcf_records = [
       '19	2	.	A	T,C	.	.	HA=a1,a2;HG=1,2,3;HR=a,b,c;HF;HU=0.1	GT	1/0	0/1\n',
       '19	124	.	A	T	.	.	HG=3,4,5;HR=d,e;HU=1.1,1.2	GT	0/0	0/1']
   file_lines = header_lines + _SAMPLE_HEADER_LINES[1:] + vcf_records

   # First record: every declared INFO field is present.
   first_info = {
       'HA': VariantInfo(data=['a1', 'a2'], field_count='A'),
       'HG': VariantInfo(data=[1, 2, 3], field_count='G'),
       'HR': VariantInfo(data=['a', 'b', 'c'], field_count='R'),
       'HF': VariantInfo(data=True, field_count='0'),
       'HU': VariantInfo(data=[0.1], field_count=None),
   }
   first_expected = Variant(
       reference_name='19',
       start=1,
       end=2,
       reference_bases='A',
       alternate_bases=['T', 'C'],
       info=first_info)
   for expected_call in (VariantCall(name='Sample1', genotype=[1, 0]),
                         VariantCall(name='Sample2', genotype=[0, 1])):
     first_expected.calls.append(expected_call)

   # Second record: only HG, HR and HU are present.
   second_info = {
       'HG': VariantInfo(data=[3, 4, 5], field_count='G'),
       'HR': VariantInfo(data=['d', 'e'], field_count='R'),
       'HU': VariantInfo(data=[1.1, 1.2], field_count=None),
   }
   second_expected = Variant(
       reference_name='19',
       start=123,
       end=124,
       reference_bases='A',
       alternate_bases=['T'],
       info=second_info)
   for expected_call in (VariantCall(name='Sample1', genotype=[0, 0]),
                         VariantCall(name='Sample2', genotype=[0, 1])):
     second_expected.calls.append(expected_call)

   parsed = self._create_temp_file_and_read_records(file_lines)
   self.assertEqual(2, len(parsed))
   self._assert_variants_equal([first_expected, second_expected], parsed)
Exemplo n.º 3
0
  def test_sort_variants(self):
    """Check that sorting any ordering of the variants gives one fixed order.

    Every permutation of the list below is sorted and compared against the
    list itself, so the list is written in the expected sorted order.
    """
    field_specs = (
        dict(reference_name='a', start=20, end=22),
        dict(reference_name='a', start=20, end=22, quality=20),
        dict(reference_name='b', start=20, end=22),
        dict(reference_name='b', start=21, end=22),
        dict(reference_name='b', start=21, end=23),
    )
    expected_order = [Variant(**fields) for fields in field_specs]

    for shuffled in permutations(expected_order):
      self.assertEqual(sorted(shuffled), expected_order)
Exemplo n.º 4
0
 def test_format_numbers(self):
     """Check FORMAT parsing for Number='.', Number=1 and Number=2 fields.

     A single record with two samples is read. The expected call info holds
     a list for the variable-count and two-value fields and a bare integer
     for the Number=1 field.
     """
     header_lines = [
         '##FORMAT=<ID=FU,Number=.,Type=String,Description="Format_variable">\n',
         '##FORMAT=<ID=F1,Number=1,Type=Integer,Description="Format_one">\n',
         '##FORMAT=<ID=F2,Number=2,Type=Character,Description="Format_two">\n'
     ]
     vcf_records = [
         '19	2	.	A	T,C	.	.	.	GT:FU:F1:F2	1/0:a1:3:a,b	0/1:a2,a3:4:b,c\n'
     ]
     file_lines = header_lines + _SAMPLE_HEADER_LINES[1:] + vcf_records

     sample1_info = {'FU': ['a1'], 'F1': 3, 'F2': ['a', 'b']}
     sample2_info = {'FU': ['a2', 'a3'], 'F1': 4, 'F2': ['b', 'c']}

     expected = Variant(
         reference_name='19', start=1, end=2,
         reference_bases='A', alternate_bases=['T', 'C'])
     for expected_call in (
             VariantCall(name='Sample1', genotype=[1, 0], info=sample1_info),
             VariantCall(name='Sample2', genotype=[0, 1], info=sample2_info)):
         expected.calls.append(expected_call)

     parsed = self._create_temp_file_and_read_records(file_lines)
     self.assertEqual(1, len(parsed))
     self.assertEqual(expected, parsed[0])
Exemplo n.º 5
0
    def _get_sample_variant_3(self):
        """Build the third sample variant and its VCF line.

        Features:
          symbolic alternate
          no calls for sample 2

        Returns:
          A (variant, vcf_line) tuple: the expected Variant object and the
          VCF record text it corresponds to.
        """
        # NOTE(review): the two sample columns below are separated by a space
        # rather than a tab -- confirm this is intentional before changing it.
        line = '19	12	.	C	<SYMBOLIC>	49	q10	AF=0.5	GT:GQ	0|1:45 .:.\n'

        sample_calls = (
            VariantCall(name='Sample1',
                        genotype=[0, 1],
                        phaseset=DEFAULT_PHASESET_VALUE,
                        info={'GQ': 45}),
            VariantCall(name='Sample2',
                        genotype=[MISSING_GENOTYPE_VALUE],
                        info={'GQ': None}),
        )
        expected = Variant(reference_name='19',
                           start=11,
                           end=12,
                           reference_bases='C',
                           alternate_bases=['<SYMBOLIC>'],
                           quality=49,
                           filters=['q10'],
                           info={'AF': VariantInfo(data=[0.5],
                                                   field_count='A')})
        for sample_call in sample_calls:
            expected.calls.append(sample_call)
        return expected, line
Exemplo n.º 6
0
    def _get_sample_variant_2(self):
        """Build the second sample variant and its VCF line.

        Features:
          multiple references
          no alternate
          phased
          multiple filters
          missing format field

        Returns:
          A (variant, vcf_line) tuple: the expected Variant object and the
          VCF record text it corresponds to.
        """
        line = '19	123	rs1234	GTC	.	40	q10;s50	NS=2	GT:GQ	1|0:48	0/1:.\n'

        sample_calls = (
            VariantCall(name='Sample1',
                        genotype=[1, 0],
                        phaseset=DEFAULT_PHASESET_VALUE,
                        info={'GQ': 48}),
            # Sample2 has '.' for GQ in the record, expected as None here.
            VariantCall(name='Sample2', genotype=[0, 1], info={'GQ': None}),
        )
        expected = Variant(
            reference_name='19',
            start=122,
            end=125,
            reference_bases='GTC',
            alternate_bases=[],
            names=['rs1234'],
            quality=40,
            filters=['q10', 's50'],
            info={'NS': VariantInfo(data=2, field_count='1')})
        for sample_call in sample_calls:
            expected.calls.append(sample_call)
        return expected, line
Exemplo n.º 7
0
    def _get_sample_variant_1(self):
        """Build the first sample variant and its VCF line.

        Features:
          multiple alternates
          not phased
          multiple names

        Returns:
          A (variant, vcf_line) tuple: the expected Variant object and the
          VCF record text it corresponds to.
        """
        # Fixed columns first, then the FORMAT and per-sample columns.
        line = ('20	1234	rs123;rs2	C	A,T	50	PASS	'
                'AF=0.5,0.1;NS=1	GT:GQ	0/0:48	1/0:20\n')

        expected_info = {
            'AF': VariantInfo(data=[0.5, 0.1], field_count='A'),
            'NS': VariantInfo(data=1, field_count='1'),
        }
        expected = Variant(
            reference_name='20',
            start=1233,
            end=1234,
            reference_bases='C',
            alternate_bases=['A', 'T'],
            names=['rs123', 'rs2'],
            quality=50,
            filters=['PASS'],
            info=expected_info)
        for sample_call in (
                VariantCall(name='Sample1', genotype=[0, 0], info={'GQ': 48}),
                VariantCall(name='Sample2', genotype=[1, 0], info={'GQ': 20})):
            expected.calls.append(sample_call)
        return expected, line
Exemplo n.º 8
0
 def test_end_info_key(self):
   """Check how an END INFO key affects the expected variant end.

   Two records share POS 123. The one carrying END=1111 is expected to have
   end=1111; the one without END is expected to have end=123. Neither
   expected variant lists END in its info.
   """
   end_header_line = (
       '##INFO=<ID=END,Number=1,Type=Integer,Description="End of record.">\n')
   vcf_records = [
       '19	123	.	A	.	.	.	END=1111	GT	1/0	0/1\n',
       '19	123	.	A	.	.	.	.	GT	0/1	1/1\n',
   ]
   file_lines = [end_header_line] + _SAMPLE_HEADER_LINES[1:] + vcf_records

   with_end = Variant(
       reference_name='19', start=122, end=1111, reference_bases='A')
   for expected_call in (VariantCall(name='Sample1', genotype=[1, 0]),
                         VariantCall(name='Sample2', genotype=[0, 1])):
     with_end.calls.append(expected_call)

   without_end = Variant(
       reference_name='19', start=122, end=123, reference_bases='A')
   for expected_call in (VariantCall(name='Sample1', genotype=[0, 1]),
                         VariantCall(name='Sample2', genotype=[1, 1])):
     without_end.calls.append(expected_call)

   parsed = self._create_temp_file_and_read_records(file_lines)
   self.assertEqual(2, len(parsed))
   self._assert_variants_equal([with_end, without_end], parsed)
Exemplo n.º 9
0
    def test_variant_equality(self):
        """Check Variant equality against equal, altered and reduced copies.

        A variant must equal one built from the same arguments, and must
        differ from one whose call genotype differs and from one built
        without any calls.
        """

        def build_variant(**extra_fields):
            # Fresh containers on every call so no two variants share a
            # list or dict instance.
            fields = dict(reference_name='a',
                          start=20,
                          end=22,
                          reference_bases='a',
                          alternate_bases=['g', 't'],
                          names=['variant'],
                          quality=9,
                          filters=['q10'],
                          info={'key': 'value'})
            fields.update(extra_fields)
            return Variant(**fields)

        base_variant = build_variant(calls=[VariantCall(genotype=[0, 0])])
        equal_variant = build_variant(calls=[VariantCall(genotype=[0, 0])])
        different_calls = build_variant(calls=[VariantCall(genotype=[1, 0])])
        # Same fields as the base variant but with no calls argument.
        missing_field = build_variant()

        self.assertEqual(base_variant, equal_variant)
        self.assertNotEqual(base_variant, different_calls)
        self.assertNotEqual(base_variant, missing_field)
Exemplo n.º 10
0
 def test_no_info(self):
     """Check a record whose REF, ALT, QUAL, FILTER, INFO and samples are '.'.

     The expected variant has only a reference name and coordinates, plus
     one call per sample with a single missing-genotype value.
     """
     vcf_record = 'chr19	123	.	.	.	.	.	.	GT	.	.'

     expected = Variant(reference_name='chr19', start=122, end=123)
     for sample_name in ('Sample1', 'Sample2'):
         expected.calls.append(
             VariantCall(name=sample_name,
                         genotype=[MISSING_GENOTYPE_VALUE]))

     parsed = self._create_temp_file_and_read_records(
         _SAMPLE_HEADER_LINES + [vcf_record])
     self.assertEqual(1, len(parsed))
     self.assertEqual(expected, parsed[0])
Exemplo n.º 11
0
 def test_no_samples(self):
   """Check reading a file whose column header declares no sample columns.

   The last sample header line is swapped for a column header ending at
   INFO, and the single record is expected to yield a variant with no calls
   appended.
   """
   column_header = '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO\n'
   vcf_record = '19	123	.	G	A	.	PASS	AF=0.2'
   file_lines = _SAMPLE_HEADER_LINES[:-1] + [column_header, vcf_record]

   expected = Variant(
       reference_name='19',
       start=122,
       end=123,
       reference_bases='G',
       alternate_bases=['A'],
       filters=['PASS'],
       info={'AF': VariantInfo(data=[0.2], field_count='A')})

   parsed = self._create_temp_file_and_read_records(file_lines)
   self.assertEqual(1, len(parsed))
   self.assertEqual(expected, parsed[0])