def setUp(self):
    """Build the fixtures shared by the tests: a converter and a Vcf with a minimal header."""
    self.converter = VcfToBedpeConverter()

    # Meta-information lines declaring the INFO/FORMAT fields.
    meta_lines = [
        '##fileformat=VCFv4.2',
        '##fileDate=20090805',
        '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
    ]
    # NOTE(review): this column line is space-separated as it appears in this
    # file; VCF column lines are normally tab-separated -- confirm the literal.
    column_line = '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001'

    self.vcf = Vcf()
    self.vcf.add_header(meta_lines + [column_line])
def setUp(self):
    """Create a fresh VcfToBedpeConverter and a Vcf loaded with a small header."""
    # NOTE(review): the final '#CHROM ...' entry is space-separated as it
    # appears in this file; VCF column lines are normally tab-separated --
    # confirm the literal before relying on sample parsing.
    vcf_header = [
        '##fileformat=VCFv4.2',
        '##fileDate=20090805',
        '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001',
    ]
    self.converter = VcfToBedpeConverter()
    self.vcf = Vcf()
    self.vcf.add_header(vcf_header)
def vcfToBedpe(vcf_file, bedpe_out):
    """Convert VCF text lines to BEDPE records and write them to ``bedpe_out``.

    Header handling: ``##`` lines are collected, with ``##fileformat``
    rewritten to ``BEDPE`` and ``##fileDate`` rewritten to today's date. The
    single-``#`` column line supplies the sample names (columns 10 onward).
    The header is written when the first record line is reached.

    Record handling: every record gets a ``POS`` INFO value. Records whose
    ``SVTYPE`` is not ``BND`` are converted and written immediately. A ``BND``
    record is paired with its mate purely by ``MATEID``: if the mate is
    already being held it is written as a pair (held record first), otherwise
    the record is held under its own ID. ``BND`` records without ``MATEID``
    are held as well. Records still held at the end of input are written
    alone, with a warning on stderr.

    Args:
        vcf_file: iterable of VCF text lines (newline-terminated).
        bedpe_out: writable text stream; it is closed before returning.

    Returns:
        None.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    bnds = dict()      # BND records waiting for their mate, keyed by record ID
    sec_bnds = dict()  # never filled by this function; kept for the final flush
    for line in vcf_file:
        if in_header:
            if line[0:2] == '##':
                if line.split('=')[0] == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                if line.split('=')[0] == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            elif line[0] == '#' and line[1] != '#':
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            else:
                # First record line: finalize and print the header.
                in_header = False
                vcf.add_header(header)
                if "SVTYPE" in [info.id for info in vcf.info_list]:
                    vcf.add_info_after(
                        "SVTYPE", "POS", 1, 'Integer',
                        'Position of the variant described in this record')
                header = vcf.get_header()
                # Keep the rendered header up to its last newline (presumably
                # this drops the VCF column line, which the BEDPE column line
                # below replaces -- confirm against Vcf.get_header).
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                final_header_line = [
                    '#CHROM_A', 'START_A', 'END_A',
                    'CHROM_B', 'START_B', 'END_B',
                    'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
                    'NAME_A', 'REF_A', 'ALT_A',
                    'NAME_B', 'REF_B', 'ALT_B',
                    'INFO_A', 'INFO_B',
                ]
                if len(sample_list) > 0:
                    bedpe_out.write('\t'.join(
                        final_header_line
                        + ['FORMAT', '\t'.join(map(str, sample_list))]) + '\n')
                else:
                    bedpe_out.write('\t'.join(final_header_line) + '\n')

        v = line.rstrip().split('\t')
        var = svtools.vcf.variant.Variant(v, vcf)
        var.set_info("POS", var.pos)
        # The EVENT ID is not unique. Stick with ID column.
        unique_name = var.var_id

        if var.info['SVTYPE'] != 'BND':
            bedpe_out.write(str(converter.convert(var)) + '\n')
        elif 'MATEID' in var.info and var.info['MATEID'] in bnds:
            # Manta doesn't use the SECONDARY flag, so pairing relies only on
            # whether the mate ID has already been seen.
            mate_id = var.info['MATEID']
            var1 = bnds[mate_id]
            bedpe_out.write(str(converter.convert(var1, var)) + '\n')
            del bnds[mate_id]
        else:
            # Mate not seen yet (or no MATEID at all): hold the record.
            bnds[unique_name] = var

    # Portable replacement for the Python-2-only dict.viewkeys() intersection.
    # Since sec_bnds is never filled here, this currently pairs nothing.
    for key in set(bnds) & set(sec_bnds):
        bedpe_out.write(str(converter.convert(bnds[key], sec_bnds[key])) + '\n')
        del bnds[key]
        del sec_bnds[key]

    # Anything still held never met its mate: emit it alone and warn.
    for bnd in bnds:
        sys.stderr.write('Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')
    for bnd in sec_bnds:
        sys.stderr.write('Warning: missing primary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(None, sec_bnds[bnd])) + '\n')

    # close the files
    bedpe_out.close()
class TestVcfToBedpeConverter(TestCase):
    """Unit tests for the parsing and breakpoint helpers of VcfToBedpeConverter."""

    def setUp(self):
        """Create the converter under test and a Vcf carrying a minimal header."""
        self.converter = VcfToBedpeConverter()
        # NOTE(review): the '#CHROM ...' entry is space-separated as it
        # appears in this file; VCF column lines are normally tab-separated
        # -- confirm the literal.
        minimal_header = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001',
        ]
        self.vcf = Vcf()
        self.vcf.add_header(minimal_header)

    def _variant(self, alt, info):
        """Return a Variant at 1:20000 (ID 235, REF T) with the given ALT and INFO."""
        fields = ['1', '20000', '235', 'T', alt, '0.00', '.', info, 'GT', '0/0']
        return Variant(fields, self.vcf)

    def test_bnd_alt_string(self):
        """parse_bnd_alt_string returns (bracket, chrom, pos) and rejects malformed ALTs."""
        parse = self.converter.parse_bnd_alt_string
        self.assertEqual(parse('A[1:6['), ('[', '1', 6))
        self.assertEqual(parse('A]1:6]'), (']', '1', 6))
        self.assertEqual(parse(']1:6]A'), (']', '1', 6))
        # Mismatched brackets are rejected.
        with self.assertRaises(AssertionError):
            parse(']1:6[A')
        # So is an ALT with no brackets at all.
        with self.assertRaises(AssertionError):
            parse('1')

    def test_bnd_breakpoints(self):
        """bnd_breakpoints yields coordinates and strands for both ends of a BND."""
        mate_after = self._variant('A[1:6[', '.')
        self.assertEqual(
            self.converter.bnd_breakpoints(mate_after),
            ('1', 20000, 20000, '1', 5, 5, '+', '-'))

        mate_before = self._variant(']1:6]N', '.')
        self.assertEqual(
            self.converter.bnd_breakpoints(mate_before),
            ('1', 19999, 19999, '1', 6, 6, '-', '+'))

    def test_simple_breakpoints(self):
        """simple_breakpoints uses END (and STRANDS when given); no END is an error."""
        end_only = self._variant('<DEL>', 'END=20500')
        self.assertEqual(
            self.converter.simple_breakpoints(end_only),
            ('1', 20000, 20000, '1', 20500, 20500, '+', '-'))

        end_and_strands = self._variant('<DEL>', 'END=20500;STRANDS=-+:2')
        self.assertEqual(
            self.converter.simple_breakpoints(end_and_strands),
            ('1', 20000, 20000, '1', 20500, 20500, '-', '+'))

        strands_only = self._variant('<DEL>', 'STRANDS=--:2')
        with self.assertRaises(ValueError):
            self.converter.simple_breakpoints(strands_only)

    def test_adjust_coordinate(self):
        """adjust_coordinate widens by the named interval, or leaves values alone if absent."""
        with_ciend = self._variant('<DEL>', 'CIEND=-50,50')
        self.assertEqual(
            self.converter.adjust_coordinate(with_ciend, 'CIEND', 500, 1000),
            (450, 1050))
        # CIPOS is not present on this record, so the inputs come back unchanged.
        self.assertEqual(
            self.converter.adjust_coordinate(with_ciend, 'CIPOS', 500, 1000),
            (500, 1000))

        # A single-valued interval is malformed.
        bad_ciend = self._variant('<DEL>', 'CIEND=50')
        with self.assertRaises(ValueError):
            self.converter.adjust_coordinate(bad_ciend, 'CIEND', 500, 1000)
class TestVcfToBedpeConverter(TestCase):
    """Exercise VcfToBedpeConverter's ALT parsing and breakpoint computations."""

    def setUp(self):
        """Prepare a converter and a Vcf whose header declares DP, AF, END and GT."""
        self.converter = VcfToBedpeConverter()
        # NOTE(review): the column line is space-separated as it appears in
        # this file; VCF column lines are normally tab-separated -- confirm.
        header_text = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001',
        ]
        self.vcf = Vcf()
        self.vcf.add_header(header_text)

    def test_bnd_alt_string(self):
        """Well-formed BND ALT strings parse; malformed ones raise AssertionError."""
        well_formed = [
            ('A[1:6[', ('[', '1', 6)),
            ('A]1:6]', (']', '1', 6)),
            (']1:6]A', (']', '1', 6)),
        ]
        for alt, parsed in well_formed:
            self.assertEqual(self.converter.parse_bnd_alt_string(alt), parsed)

        for malformed in (']1:6[A', '1'):
            with self.assertRaises(AssertionError):
                self.converter.parse_bnd_alt_string(malformed)

    def test_bnd_breakpoints(self):
        """Breakpoint tuples for the two BND ALT orientations."""
        expectations = [
            ('A[1:6[', ('1', 20000, 20000, '1', 5, 5, '+', '-')),
            (']1:6]N', ('1', 19999, 19999, '1', 6, 6, '-', '+')),
        ]
        for alt, breakpoints in expectations:
            record = ['1', '20000', '235', 'T', alt, '0.00', '.', '.', 'GT', '0/0']
            variant = Variant(record, self.vcf)
            self.assertEqual(self.converter.bnd_breakpoints(variant), breakpoints)

    def test_simple_breakpoints(self):
        """END drives the second breakpoint; STRANDS overrides the default strands."""
        expectations = [
            ('END=20500', ('1', 20000, 20000, '1', 20500, 20500, '+', '-')),
            ('END=20500;STRANDS=-+:2', ('1', 20000, 20000, '1', 20500, 20500, '-', '+')),
        ]
        for info, breakpoints in expectations:
            record = ['1', '20000', '235', 'T', '<DEL>', '0.00', '.', info, 'GT', '0/0']
            variant = Variant(record, self.vcf)
            self.assertEqual(self.converter.simple_breakpoints(variant), breakpoints)

        # No END in INFO: the converter must refuse.
        no_end = Variant(
            ['1', '20000', '235', 'T', '<DEL>', '0.00', '.', 'STRANDS=--:2', 'GT', '0/0'],
            self.vcf)
        with self.assertRaises(ValueError):
            self.converter.simple_breakpoints(no_end)

    def test_adjust_coordinate(self):
        """Confidence intervals shift the coordinates; a one-value interval raises."""
        two_sided = Variant(
            ['1', '20000', '235', 'T', '<DEL>', '0.00', '.', 'CIEND=-50,50', 'GT', '0/0'],
            self.vcf)
        expectations = [
            ('CIEND', (450, 1050)),
            ('CIPOS', (500, 1000)),  # absent from INFO -> coordinates unchanged
        ]
        for info_tag, adjusted in expectations:
            self.assertEqual(
                self.converter.adjust_coordinate(two_sided, info_tag, 500, 1000),
                adjusted)

        one_sided = Variant(
            ['1', '20000', '235', 'T', '<DEL>', '0.00', '.', 'CIEND=50', 'GT', '0/0'],
            self.vcf)
        with self.assertRaises(ValueError):
            self.converter.adjust_coordinate(one_sided, 'CIEND', 500, 1000)
def vcfToBedpe(vcf_file, bedpe_out):
    """Convert VCF text lines to BEDPE records and write them to ``bedpe_out``.

    Header handling: ``##`` lines are collected, with ``##fileformat``
    rewritten to ``BEDPE`` and ``##fileDate`` rewritten to today's date. The
    single-``#`` column line supplies the sample names (columns 10 onward).
    The header is written when the first record line is reached.

    Record handling: every record gets a ``POS`` INFO value. Records that are
    not ``BND``, or that are ``BND`` without a ``MATEID``, are converted and
    written immediately. Two-line ``BND`` records are paired by ``MATEID``:

    * a record flagged ``SECONDARY`` pairs with a held unflagged mate,
      otherwise it is held in ``sec_bnds``;
    * an unflagged record pairs with a held ``SECONDARY`` mate, or -- for
      input where neither mate carries ``SECONDARY`` -- with a held
      unflagged mate (the held record becomes the first of the pair),
      otherwise it is held in ``bnds``.

    Records still held at the end of input are written alone, with a warning
    on stderr.

    Args:
        vcf_file: iterable of VCF text lines (newline-terminated).
        bedpe_out: writable text stream; it is closed before returning.

    Returns:
        None.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    bnds = dict()      # unflagged BND records waiting for their mate, by ID
    sec_bnds = dict()  # SECONDARY-flagged BND records waiting for their mate
    for line in vcf_file:
        if in_header:
            if line[0:2] == '##':
                if line.split('=')[0] == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                if line.split('=')[0] == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            elif line[0] == '#' and line[1] != '#':
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            else:
                # First record line: finalize and print the header.
                in_header = False
                vcf.add_header(header)
                if "SVTYPE" in [info.id for info in vcf.info_list]:
                    vcf.add_info_after(
                        "SVTYPE", "POS", 1, 'Integer',
                        'Position of the variant described in this record')
                header = vcf.get_header()
                # Keep the rendered header up to its last newline (presumably
                # this drops the VCF column line, which the BEDPE column line
                # below replaces -- confirm against Vcf.get_header).
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                final_header_line = [
                    '#CHROM_A', 'START_A', 'END_A',
                    'CHROM_B', 'START_B', 'END_B',
                    'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
                    'NAME_A', 'REF_A', 'ALT_A',
                    'NAME_B', 'REF_B', 'ALT_B',
                    'INFO_A', 'INFO_B',
                ]
                if len(sample_list) > 0:
                    bedpe_out.write('\t'.join(
                        final_header_line
                        + ['FORMAT', '\t'.join(map(str, sample_list))]) + '\n')
                else:
                    bedpe_out.write('\t'.join(final_header_line) + '\n')

        v = line.rstrip().split('\t')
        var = svtools.vcf.variant.Variant(v, vcf)
        var.set_info("POS", var.pos)

        # If there is no MATEID then assume this is a single-ended BND and simply output
        if var.info['SVTYPE'] != 'BND' or 'MATEID' not in var.info:
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        mate_id = var.info['MATEID']
        if 'SECONDARY' in var.info:
            if mate_id in bnds:
                # The unflagged (primary) mate was seen first.
                var1 = bnds.pop(mate_id)
                bedpe_out.write(str(converter.convert(var1, var)) + '\n')
            else:
                sec_bnds[var.var_id] = var
        elif mate_id in sec_bnds:
            var2 = sec_bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var, var2)) + '\n')
        elif mate_id in bnds:
            # Neither mate carries SECONDARY: pair on MATEID alone instead of
            # leaving both records to be flushed unpaired with warnings.
            var1 = bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var1, var)) + '\n')
        else:
            bnds[var.var_id] = var

    # Anything still held never met its mate: emit it alone and warn.
    for bnd in bnds:
        sys.stderr.write('Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')
    for bnd in sec_bnds:
        sys.stderr.write('Warning: missing primary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(None, sec_bnds[bnd])) + '\n')

    # close the files
    bedpe_out.close()
def vcfToBedpe(vcf_file, bedpe_out):
    """Convert VCF text lines to BEDPE records and write them to ``bedpe_out``.

    Header handling: ``##`` lines are collected, with ``##fileformat``
    rewritten to ``BEDPE`` and ``##fileDate`` rewritten to today's date. The
    single-``#`` column line supplies the sample names (columns 10 onward).
    The header is written when the first record line is reached, or after
    the input is exhausted if it held header lines but no records, so that a
    header-only VCF still yields a BEDPE header.

    Record handling: every record gets a ``POS`` INFO value. Records that are
    not ``BND``, or that are ``BND`` without a ``MATEID``, are converted and
    written immediately. Two-line ``BND`` records are paired by ``MATEID``:

    * a record flagged ``SECONDARY`` pairs with a held unflagged mate,
      otherwise it is held in ``sec_bnds``;
    * an unflagged record pairs with a held ``SECONDARY`` mate, or with a
      held unflagged mate (the held record becomes the first of the pair),
      otherwise it is held in ``bnds``.

    Records still held at the end of input are written alone, with a warning
    on stderr.

    Args:
        vcf_file: iterable of VCF text lines (newline-terminated).
        bedpe_out: writable text stream; it is closed before returning.

    Returns:
        None.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    bnds = dict()      # unflagged BND records waiting for their mate, by ID
    sec_bnds = dict()  # SECONDARY-flagged BND records waiting for their mate
    for line in vcf_file:
        if in_header:
            if line[0:2] == '##':
                if line.split('=')[0] == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                if line.split('=')[0] == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            elif line[0] == '#' and line[1] != '#':
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            else:
                # First record line: the header is complete, print it.
                in_header = False
                _write_bedpe_header(vcf, header, sample_list, bedpe_out)

        v = line.rstrip().split('\t')
        var = svtools.vcf.variant.Variant(v, vcf)
        var.set_info("POS", var.pos)

        # If there is no MATEID then assume this is a single-ended BND and simply output
        if var.info['SVTYPE'] != 'BND' or 'MATEID' not in var.info:
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        mate_id = var.info['MATEID']
        if 'SECONDARY' in var.info:
            if mate_id in bnds:
                # The unflagged (primary) mate was seen first.
                var1 = bnds.pop(mate_id)
                bedpe_out.write(str(converter.convert(var1, var)) + '\n')
            else:
                sec_bnds[var.var_id] = var
        elif mate_id in sec_bnds:
            var2 = sec_bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var, var2)) + '\n')
        elif mate_id in bnds:
            # Neither mate carries SECONDARY: pair on MATEID alone.
            var1 = bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var1, var)) + '\n')
        else:
            bnds[var.var_id] = var

    # Input ended without any record line: the header was never printed.
    # Emit it now so a header-only VCF does not produce an empty file.
    if in_header and header:
        _write_bedpe_header(vcf, header, sample_list, bedpe_out)

    # Anything still held never met its mate: emit it alone and warn.
    for bnd in bnds:
        sys.stderr.write('Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')
    for bnd in sec_bnds:
        sys.stderr.write('Warning: missing primary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(None, sec_bnds[bnd])) + '\n')

    # close the files
    bedpe_out.close()


def _write_bedpe_header(vcf, header_lines, sample_list, bedpe_out):
    """Load ``header_lines`` into ``vcf`` and write the BEDPE header to ``bedpe_out``."""
    vcf.add_header(header_lines)
    # Declare the POS INFO field (set on every record) right after SVTYPE.
    if "SVTYPE" in [info.id for info in vcf.info_list]:
        vcf.add_info_after(
            "SVTYPE", "POS", 1, 'Integer',
            'Position of the variant described in this record')
    rendered = vcf.get_header()
    # Keep the rendered header up to its last newline (presumably this drops
    # the VCF column line, which the BEDPE column line below replaces --
    # confirm against Vcf.get_header).
    bedpe_out.write(rendered[:rendered.rfind('\n')] + '\n')
    final_header_line = [
        '#CHROM_A', 'START_A', 'END_A',
        'CHROM_B', 'START_B', 'END_B',
        'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
        'NAME_A', 'REF_A', 'ALT_A',
        'NAME_B', 'REF_B', 'ALT_B',
        'INFO_A', 'INFO_B',
    ]
    if len(sample_list) > 0:
        bedpe_out.write('\t'.join(
            final_header_line
            + ['FORMAT', '\t'.join(map(str, sample_list))]) + '\n')
    else:
        bedpe_out.write('\t'.join(final_header_line) + '\n')
def vcfToBedpe(vcf_file, bedpe_out):
    """Convert VCF text lines to BEDPE records and write them to ``bedpe_out``.

    Header handling: ``##`` lines are collected, with ``##fileformat``
    rewritten to ``BEDPE`` and ``##fileDate`` rewritten to today's date. The
    single-``#`` column line supplies the sample names (columns 10 onward).
    The header is written when the first record line is reached.

    Record handling: every record gets a ``POS`` INFO value. Records whose
    ``SVTYPE`` is not ``BND`` are converted and written immediately. ``BND``
    records are paired on ``MATEID`` only (the ``SECONDARY`` flag is not
    consulted): when the mate is already held, the pair is written with the
    held record first; otherwise the record is held under its own ID.
    ``BND`` records lacking ``MATEID`` are held too. Records still held when
    the input ends are written alone, with a warning on stderr.

    Args:
        vcf_file: iterable of VCF text lines (newline-terminated).
        bedpe_out: writable text stream; it is closed before returning.

    Returns:
        None.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    bnds = dict()      # BND records waiting for their mate, keyed by record ID
    sec_bnds = dict()  # never filled by this function; kept for the final flush
    for line in vcf_file:
        if in_header:
            if line[0:2] == '##':
                if line.split('=')[0] == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                if line.split('=')[0] == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            elif line[0] == '#' and line[1] != '#':
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            else:
                # print header
                in_header = False
                vcf.add_header(header)
                if "SVTYPE" in [info.id for info in vcf.info_list]:
                    vcf.add_info_after(
                        "SVTYPE", "POS", 1, 'Integer',
                        'Position of the variant described in this record')
                header = vcf.get_header()
                # Keep the rendered header up to its last newline (presumably
                # this drops the VCF column line, which the BEDPE column line
                # below replaces -- confirm against Vcf.get_header).
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                final_header_line = [
                    '#CHROM_A', 'START_A', 'END_A',
                    'CHROM_B', 'START_B', 'END_B',
                    'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
                    'NAME_A', 'REF_A', 'ALT_A',
                    'NAME_B', 'REF_B', 'ALT_B',
                    'INFO_A', 'INFO_B',
                ]
                if len(sample_list) > 0:
                    bedpe_out.write('\t'.join(
                        final_header_line
                        + ['FORMAT', '\t'.join(map(str, sample_list))]) + '\n')
                else:
                    bedpe_out.write('\t'.join(final_header_line) + '\n')

        v = line.rstrip().split('\t')
        var = svtools.vcf.variant.Variant(v, vcf)
        var.set_info("POS", var.pos)
        # The EVENT ID is not unique. Stick with ID column.
        unique_name = var.var_id

        if var.info['SVTYPE'] != 'BND':
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        # Manta doesn't use the SECONDARY flag
        # So, just check whether the mate ID is in bnds
        # Which means that the mate has already been seen
        mate_id = var.info['MATEID'] if 'MATEID' in var.info else None
        if mate_id is not None and mate_id in bnds:
            var1 = bnds[mate_id]
            bedpe_out.write(str(converter.convert(var1, var)) + '\n')
            del bnds[mate_id]
        else:
            bnds[unique_name] = var

    # dict.viewkeys() exists only on Python 2; intersect plain key sets so the
    # function also runs on Python 3. With sec_bnds never filled in this
    # function, the intersection is always empty at present.
    intersected_keys = set(bnds) & set(sec_bnds)
    for key in intersected_keys:
        bedpe_out.write(
            str(converter.convert(bnds[key], sec_bnds[key])) + '\n')
        del bnds[key]
        del sec_bnds[key]

    # Anything still held never met its mate: emit it alone and warn.
    for bnd in bnds:
        sys.stderr.write(
            'Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')
    for bnd in sec_bnds:
        sys.stderr.write(
            'Warning: missing primary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(None, sec_bnds[bnd])) + '\n')

    # close the files
    bedpe_out.close()