def test_custom_phaseset(self):
    """Check that a header-declared PS FORMAT field is read as ``phaseset``.

    Two records are read. In the first, Sample2 carries '.' for PS and the
    expected call is built without a phaseset; in the second, both samples
    are expected to carry phaseset '2222'.
    """
    ps_header = (
        '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phaseset">\n')
    # NOTE(review): the fields below are separated by single spaces; VCF is
    # normally tab-delimited -- confirm the separators against the file.
    vcf_records = [
        '19 123 . A T . . . GT:PS 1|0:1111 0/1:.\n',
        '19 121 . A T . . . GT:PS 1|0:2222 0/1:2222\n',
    ]

    expected_first = Variant(
        reference_name='19',
        start=122,
        end=123,
        reference_bases='A',
        alternate_bases=['T'],
        calls=[
            VariantCall(name='Sample1', genotype=[1, 0], phaseset='1111'),
            VariantCall(name='Sample2', genotype=[0, 1]),
        ])
    expected_second = Variant(
        reference_name='19',
        start=120,
        end=121,
        reference_bases='A',
        alternate_bases=['T'],
        calls=[
            VariantCall(name='Sample1', genotype=[1, 0], phaseset='2222'),
            VariantCall(name='Sample2', genotype=[0, 1], phaseset='2222'),
        ])

    # The first sample header line is dropped and the PS header is put in
    # its place.
    file_lines = [ps_header] + _SAMPLE_HEADER_LINES[1:] + vcf_records
    parsed = self._create_temp_file_and_read_records(file_lines)

    self.assertEqual(2, len(parsed))
    self._assert_variants_equal([expected_first, expected_second], parsed)
def test_format_numbers(self):
    """Check FORMAT fields declared with Number=., Number=1 and Number=2.

    The expected calls hold lists for the variable-count (FU) and
    two-value (F2) fields and a bare integer for the single-value (F1)
    field.
    """
    header_lines = [
        '##FORMAT=<ID=FU,Number=.,Type=String,Description="Format_variable">\n',
        '##FORMAT=<ID=F1,Number=1,Type=Integer,Description="Format_one">\n',
        '##FORMAT=<ID=F2,Number=2,Type=Character,Description="Format_two">\n',
    ]
    vcf_records = [
        '19 2 . A T,C . . . GT:FU:F1:F2 1/0:a1:3:a,b 0/1:a2,a3:4:b,c\n',
    ]

    sample1_call = VariantCall(
        name='Sample1',
        genotype=[1, 0],
        info={'FU': ['a1'], 'F1': 3, 'F2': ['a', 'b']})
    sample2_call = VariantCall(
        name='Sample2',
        genotype=[0, 1],
        info={'FU': ['a2', 'a3'], 'F1': 4, 'F2': ['b', 'c']})
    expected = Variant(
        reference_name='19',
        start=1,
        end=2,
        reference_bases='A',
        alternate_bases=['T', 'C'],
        calls=[sample1_call, sample2_call])

    parsed = self._create_temp_file_and_read_records(
        header_lines + _SAMPLE_HEADER_LINES[1:] + vcf_records)

    self.assertEqual(1, len(parsed))
    self.assertEqual(expected, parsed[0])
def _get_sample_variant_3(self):
    """Build the third sample variant together with its VCF record line.

    Features:
      symbolic alternate
      no calls for sample 2

    Returns:
      A ``(variant, vcf_line)`` tuple.
    """
    # NOTE(review): fields are separated by single spaces here; VCF is
    # normally tab-delimited -- confirm the separators against the file.
    vcf_line = '19 12 . C <SYMBOLIC> 49 q10 AF=0.5 GT:GQ 0|1:45 .:.\n'

    variant = Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=['q10'],
        info={'AF': VariantInfo(data=[0.5], field_count='A')})

    sample_calls = (
        VariantCall(name='Sample1',
                    genotype=[0, 1],
                    phaseset=DEFAULT_PHASESET_VALUE,
                    info={'GQ': 45}),
        # Sample2 is written as '.:.', i.e. no genotype and no GQ value.
        VariantCall(name='Sample2',
                    genotype=[MISSING_GENOTYPE_VALUE],
                    info={'GQ': None}),
    )
    for call in sample_calls:
        variant.calls.append(call)

    return variant, vcf_line
def _get_sample_variant_2(self):
    """Build the second sample variant together with its VCF record line.

    Features:
      multi-base reference
      no alternate
      phased
      multiple filters
      missing format field

    Returns:
      A ``(variant, vcf_line)`` tuple.
    """
    # NOTE(review): fields are separated by single spaces here; VCF is
    # normally tab-delimited -- confirm the separators against the file.
    vcf_line = '19 123 rs1234 GTC . 40 q10;s50 NS=2 GT:GQ 1|0:48 0/1:.\n'

    variant = Variant(
        reference_name='19',
        start=122,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=['q10', 's50'],
        info={'NS': VariantInfo(data=2, field_count='1')})

    sample_calls = (
        VariantCall(name='Sample1',
                    genotype=[1, 0],
                    phaseset=DEFAULT_PHASESET_VALUE,
                    info={'GQ': 48}),
        # Sample2's GQ is '.' in the record, hence the None value.
        VariantCall(name='Sample2',
                    genotype=[0, 1],
                    info={'GQ': None}),
    )
    for call in sample_calls:
        variant.calls.append(call)

    return variant, vcf_line
def _get_sample_variant_1(self):
    """Build the first sample variant together with its VCF record line.

    Features:
      multiple alternates
      not phased
      multiple names

    Returns:
      A ``(variant, vcf_line)`` tuple.
    """
    # NOTE(review): fields are separated by single spaces here; VCF is
    # normally tab-delimited -- confirm the separators against the file.
    vcf_line = (
        '20 1234 rs123;rs2 C A,T 50 PASS AF=0.5,0.1;NS=1 GT:GQ 0/0:48 1/0:20\n')

    info_fields = {
        'AF': VariantInfo(data=[0.5, 0.1], field_count='A'),
        'NS': VariantInfo(data=1, field_count='1'),
    }
    variant = Variant(
        reference_name='20',
        start=1233,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        info=info_fields)

    sample_calls = (
        VariantCall(name='Sample1', genotype=[0, 0], info={'GQ': 48}),
        VariantCall(name='Sample2', genotype=[1, 0], info={'GQ': 20}),
    )
    for call in sample_calls:
        variant.calls.append(call)

    return variant, vcf_line
def test_info_numbers_and_types(self):
    """Check INFO fields across Number=A, G, R, 0 and '.' declarations.

    The headers also cover the String, Integer, Character, Flag and Float
    types. The second record omits the HA and HF keys, so the expected
    variant has no entries for them.
    """
    header_lines = [
        '##INFO=<ID=HA,Number=A,Type=String,Description="StringInfo_A">\n',
        '##INFO=<ID=HG,Number=G,Type=Integer,Description="IntInfo_G">\n',
        '##INFO=<ID=HR,Number=R,Type=Character,Description="ChrInfo_R">\n',
        '##INFO=<ID=HF,Number=0,Type=Flag,Description="FlagInfo">\n',
        '##INFO=<ID=HU,Number=.,Type=Float,Description="FloatInfo_variable">\n',
    ]
    vcf_records = [
        '19 2 . A T,C . . HA=a1,a2;HG=1,2,3;HR=a,b,c;HF;HU=0.1 GT 1/0 0/1\n',
        '19 124 . A T . . HG=3,4,5;HR=d,e;HU=1.1,1.2 GT 0/0 0/1',
    ]

    expected_first = Variant(
        reference_name='19',
        start=1,
        end=2,
        reference_bases='A',
        alternate_bases=['T', 'C'],
        info={
            'HA': VariantInfo(data=['a1', 'a2'], field_count='A'),
            'HG': VariantInfo(data=[1, 2, 3], field_count='G'),
            'HR': VariantInfo(data=['a', 'b', 'c'], field_count='R'),
            'HF': VariantInfo(data=True, field_count='0'),
            'HU': VariantInfo(data=[0.1], field_count=None),
        },
        calls=[
            VariantCall(name='Sample1', genotype=[1, 0]),
            VariantCall(name='Sample2', genotype=[0, 1]),
        ])
    expected_second = Variant(
        reference_name='19',
        start=123,
        end=124,
        reference_bases='A',
        alternate_bases=['T'],
        info={
            'HG': VariantInfo(data=[3, 4, 5], field_count='G'),
            'HR': VariantInfo(data=['d', 'e'], field_count='R'),
            'HU': VariantInfo(data=[1.1, 1.2], field_count=None),
        },
        calls=[
            VariantCall(name='Sample1', genotype=[0, 0]),
            VariantCall(name='Sample2', genotype=[0, 1]),
        ])

    parsed = self._create_temp_file_and_read_records(
        header_lines + _SAMPLE_HEADER_LINES[1:] + vcf_records)

    self.assertEqual(2, len(parsed))
    self._assert_variants_equal([expected_first, expected_second], parsed)
def test_no_info(self):
    """Check a record whose every optional field is '.'.

    Only the reference name and position are given; both samples are
    expected to come back with a missing genotype.
    """
    vcf_record = 'chr19 123 . . . . . . GT . .'

    expected = Variant(
        reference_name='chr19',
        start=122,
        end=123,
        calls=[
            VariantCall(name=sample, genotype=[MISSING_GENOTYPE_VALUE])
            for sample in ('Sample1', 'Sample2')
        ])

    parsed = self._create_temp_file_and_read_records(
        _SAMPLE_HEADER_LINES + [vcf_record])

    self.assertEqual(1, len(parsed))
    self.assertEqual(expected, parsed[0])
def test_end_info_key(self):
    """Check that an END INFO key sets the variant's ``end``.

    The first record carries END=1111 and is expected to end there; the
    second has no END key and is expected to end at 123. Neither expected
    variant has an info entry for END.
    """
    end_header = (
        '##INFO=<ID=END,Number=1,Type=Integer,Description="End of record.">\n')
    vcf_records = [
        '19 123 . A . . . END=1111 GT 1/0 0/1\n',
        '19 123 . A . . . . GT 0/1 1/1\n',
    ]

    with_end = Variant(
        reference_name='19',
        start=122,
        end=1111,
        reference_bases='A',
        calls=[
            VariantCall(name='Sample1', genotype=[1, 0]),
            VariantCall(name='Sample2', genotype=[0, 1]),
        ])
    without_end = Variant(
        reference_name='19',
        start=122,
        end=123,
        reference_bases='A',
        calls=[
            VariantCall(name='Sample1', genotype=[0, 1]),
            VariantCall(name='Sample2', genotype=[1, 1]),
        ])

    parsed = self._create_temp_file_and_read_records(
        [end_header] + _SAMPLE_HEADER_LINES[1:] + vcf_records)

    self.assertEqual(2, len(parsed))
    self._assert_variants_equal([with_end, without_end], parsed)
def test_variant_equality(self):
    """Check ``Variant`` equality and inequality.

    A variant must equal another built from the same values, and must
    differ from one whose calls differ or whose calls are left out.
    """

    def shared_fields():
        # Fresh containers on every call, so no two variants share a list
        # or dict object.
        return dict(
            reference_name='a',
            start=20,
            end=22,
            reference_bases='a',
            alternate_bases=['g', 't'],
            names=['variant'],
            quality=9,
            filters=['q10'],
            info={'key': 'value'})

    base_variant = Variant(
        calls=[VariantCall(genotype=[0, 0])], **shared_fields())
    equal_variant = Variant(
        calls=[VariantCall(genotype=[0, 0])], **shared_fields())
    different_calls = Variant(
        calls=[VariantCall(genotype=[1, 0])], **shared_fields())
    # No ``calls`` argument at all: the constructor default is used.
    missing_field = Variant(**shared_fields())

    self.assertEqual(base_variant, equal_variant)
    self.assertNotEqual(base_variant, different_calls)
    self.assertNotEqual(base_variant, missing_field)