Exemplo n.º 1
0
 def setUp(self):
     """Prepare the fixtures shared by the tests.

     Stores a new ``VcfToBedpeConverter`` on ``self.converter`` and a new
     ``Vcf`` on ``self.vcf``; the latter is fed a fixed header made of
     file-level metadata, INFO definitions for DP, AF and END, a FORMAT
     definition for GT and a column line naming one sample (NA00001).
     """
     self.converter = VcfToBedpeConverter()
     # '##' metadata lines, in the order they are handed to the Vcf object.
     meta_lines = [
         '##fileformat=VCFv4.2',
         '##fileDate=20090805',
         '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
         '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
         '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
         '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
     ]
     # Column line; the tenth column is the single sample.
     column_line = '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001'
     self.vcf = Vcf()
     self.vcf.add_header(meta_lines + [column_line])
Exemplo n.º 2
0
 def setUp(self):
     """Build the converter under test and a ``Vcf`` with a minimal header.

     The header passed to ``Vcf.add_header`` is a fixed list: three
     file-level lines, INFO definitions for DP, AF and END, one FORMAT
     definition (GT) and the column line with sample NA00001.
     """
     self.converter = VcfToBedpeConverter()

     header_lines = []
     # File-level metadata.
     header_lines.extend([
         '##fileformat=VCFv4.2',
         '##fileDate=20090805',
         '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
     ])
     # INFO / FORMAT field definitions.
     header_lines.extend([
         '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
         '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
         '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
     ])
     # Column line with a single sample column.
     header_lines.append(
         '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001')

     self.vcf = Vcf()
     self.vcf.add_header(header_lines)
Exemplo n.º 3
0
def vcfToBedpe(vcf_file, bedpe_out):
    """Convert the VCF lines in ``vcf_file`` to BEDPE written to ``bedpe_out``.

    Header lines are collected until the first record is seen.  At that point
    the header is loaded into a ``Vcf`` object, re-rendered to ``bedpe_out``
    (with ``##fileformat`` rewritten to ``BEDPE`` and ``##fileDate`` set to
    the current date) and followed by the BEDPE column line.

    Records are then converted one by one.  Non-BND records are written
    immediately.  A BND record is written together with its mate as soon as
    the record named by its ``MATEID`` has already been stored; otherwise it
    is stored under its own ID.  Records still stored when the input ends
    are written on their own, with a warning on stderr.

    NOTE(review): the header is only written when the first record is
    reached, so header-only input produces no output -- confirm whether that
    is intended.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable file-like object; it is closed before returning.

    Returns:
        None.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    # BND records whose mate has not been seen yet, keyed by the ID column.
    bnds = dict()
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                tag = line.split('=')[0]
                if tag == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                elif tag == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            elif line.startswith('#'):
                # Column line: everything past the ninth column is a sample.
                # startswith() also copes with a bare '#' line, which the
                # previous line[1] lookup could not.
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            else:
                # First record reached: flush the header before converting.
                in_header = False
                vcf.add_header(header)
                if "SVTYPE" in [info.id for info in vcf.info_list]:
                    vcf.add_info_after(
                        "SVTYPE", "POS", 1, 'Integer',
                        'Position of the variant described in this record')
                header_text = vcf.get_header()
                # Drop the last line of the rendered header; the BEDPE
                # column line written below takes its place.
                bedpe_out.write(
                    header_text[:header_text.rfind('\n')] + '\n')
                final_header_line = [
                    '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B',
                    'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE',
                    'FILTER', 'NAME_A', 'REF_A', 'ALT_A', 'NAME_B', 'REF_B',
                    'ALT_B', 'INFO_A', 'INFO_B'
                ]
                if sample_list:
                    final_header_line += ['FORMAT'] + sample_list
                bedpe_out.write('\t'.join(final_header_line) + '\n')

        var = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        var.set_info("POS", var.pos)
        if var.info['SVTYPE'] != 'BND':
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        # Manta doesn't use the SECONDARY flag, so pairing relies solely on
        # whether the mate's ID has already been stored in bnds.
        if 'MATEID' in var.info and var.info['MATEID'] in bnds:
            mate = bnds.pop(var.info['MATEID'])
            bedpe_out.write(str(converter.convert(mate, var)) + '\n')
        else:
            # The EVENT ID is not unique, so records are keyed by the ID
            # column.  BNDs without a MATEID are stored as well and end up
            # in the unpaired flush below.
            bnds[var.var_id] = var

    # Whatever is left never met its mate: emit each record on its own.
    # (The former sec_bnds bookkeeping was never populated here, and its
    # dict.viewkeys() call does not exist on Python 3, so it was removed.)
    for bnd in bnds:
        sys.stderr.write('Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')

    # close the files
    bedpe_out.close()
Exemplo n.º 4
0
class TestVcfToBedpeConverter(TestCase):
    """Tests for the ALT-parsing and breakpoint helpers of VcfToBedpeConverter."""

    def setUp(self):
        """Create a fresh converter and a ``Vcf`` carrying a minimal header."""
        self.converter = VcfToBedpeConverter()
        self.vcf = Vcf()
        self.vcf.add_header([
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001',
        ])

    def _variant(self, alt, info):
        """Return a ``Variant`` at 1:20000 (ID 235, REF T) with the given ALT and INFO."""
        fields = ['1', '20000', '235', 'T', alt, '0.00', '.', info, 'GT', '0/0']
        return Variant(fields, self.vcf)

    def test_bnd_alt_string(self):
        parse = self.converter.parse_bnd_alt_string
        # Well-formed ALT strings: expected (bracket, chromosome, position).
        well_formed = (
            ('A[1:6[', ('[', '1', 6)),
            ('A]1:6]', (']', '1', 6)),
            (']1:6]A', (']', '1', 6)),
        )
        for alt, expected in well_formed:
            self.assertEqual(parse(alt), expected)
        # Mismatched brackets, or no brackets at all, must trip an assertion.
        for malformed in (']1:6[A', '1'):
            with self.assertRaises(AssertionError):
                parse(malformed)

    def test_bnd_breakpoints(self):
        # One ALT with the bracketed mate after the base, one with it before.
        cases = (
            ('A[1:6[', ('1', 20000, 20000, '1', 5, 5, '+', '-')),
            (']1:6]N', ('1', 19999, 19999, '1', 6, 6, '-', '+')),
        )
        for alt, expected in cases:
            variant = self._variant(alt, '.')
            self.assertEqual(self.converter.bnd_breakpoints(variant),
                             expected)

    def test_simple_breakpoints(self):
        # Without STRANDS the expected strands are '+', '-'; with
        # STRANDS=-+:2 they are '-', '+'.
        cases = (
            ('END=20500',
             ('1', 20000, 20000, '1', 20500, 20500, '+', '-')),
            ('END=20500;STRANDS=-+:2',
             ('1', 20000, 20000, '1', 20500, 20500, '-', '+')),
        )
        for info, expected in cases:
            variant = self._variant('<DEL>', info)
            self.assertEqual(self.converter.simple_breakpoints(variant),
                             expected)
        # INFO carrying only STRANDS=--:2 (no END) is expected to be rejected.
        rejected = self._variant('<DEL>', 'STRANDS=--:2')
        with self.assertRaises(ValueError):
            self.converter.simple_breakpoints(rejected)

    def test_adjust_coordinate(self):
        with_interval = self._variant('<DEL>', 'CIEND=-50,50')
        # CIEND is present: the start/end pair is widened by the interval.
        self.assertEqual(
            self.converter.adjust_coordinate(with_interval, 'CIEND', 500,
                                             1000),
            (450, 1050))
        # CIPOS is absent from INFO: the pair comes back unchanged.
        self.assertEqual(
            self.converter.adjust_coordinate(with_interval, 'CIPOS', 500,
                                             1000),
            (500, 1000))
        # A single-valued CIEND is expected to raise ValueError.
        single_valued = self._variant('<DEL>', 'CIEND=50')
        with self.assertRaises(ValueError):
            self.converter.adjust_coordinate(single_valued, 'CIEND', 500,
                                             1000)
Exemplo n.º 5
0
class TestVcfToBedpeConverter(TestCase):
    """Checks VcfToBedpeConverter's BND parsing and coordinate helpers."""

    def setUp(self):
        """Instantiate the converter and load a small fixed header into a ``Vcf``."""
        self.converter = VcfToBedpeConverter()
        file_level = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
        ]
        definitions = [
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        ]
        columns = ['#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001']
        self.vcf = Vcf()
        self.vcf.add_header(file_level + definitions + columns)

    def _make_variant(self, alt, info='.'):
        """Build a ``Variant`` at 1:20000 whose ALT and INFO columns are as given."""
        return Variant(
            ['1', '20000', '235', 'T', alt, '0.00', '.', info, 'GT', '0/0'],
            self.vcf)

    def test_bnd_alt_string(self):
        parse = self.converter.parse_bnd_alt_string
        # Accepted forms return (bracket, chromosome, position).
        self.assertEqual(parse('A[1:6['), ('[', '1', 6))
        self.assertEqual(parse('A]1:6]'), (']', '1', 6))
        self.assertEqual(parse(']1:6]A'), (']', '1', 6))
        # Mixed brackets and a bracket-less string are asserted against.
        with self.assertRaises(AssertionError):
            parse(']1:6[A')
        with self.assertRaises(AssertionError):
            parse('1')

    def test_bnd_breakpoints(self):
        breakpoints = self.converter.bnd_breakpoints
        forward = self._make_variant('A[1:6[')
        self.assertEqual(breakpoints(forward),
                         ('1', 20000, 20000, '1', 5, 5, '+', '-'))
        reverse = self._make_variant(']1:6]N')
        self.assertEqual(breakpoints(reverse),
                         ('1', 19999, 19999, '1', 6, 6, '-', '+'))

    def test_simple_breakpoints(self):
        breakpoints = self.converter.simple_breakpoints
        # END only: expected strands are '+', '-'.
        plain = self._make_variant('<DEL>', 'END=20500')
        self.assertEqual(breakpoints(plain),
                         ('1', 20000, 20000, '1', 20500, 20500, '+', '-'))
        # An explicit STRANDS=-+:2 is expected to yield '-', '+'.
        stranded = self._make_variant('<DEL>', 'END=20500;STRANDS=-+:2')
        self.assertEqual(breakpoints(stranded),
                         ('1', 20000, 20000, '1', 20500, 20500, '-', '+'))
        # INFO with STRANDS=--:2 and no END is expected to raise ValueError.
        endless = self._make_variant('<DEL>', 'STRANDS=--:2')
        with self.assertRaises(ValueError):
            breakpoints(endless)

    def test_adjust_coordinate(self):
        adjust = self.converter.adjust_coordinate
        ranged = self._make_variant('<DEL>', 'CIEND=-50,50')
        # The CIEND interval widens the pair; the absent CIPOS leaves it as is.
        self.assertEqual(adjust(ranged, 'CIEND', 500, 1000), (450, 1050))
        self.assertEqual(adjust(ranged, 'CIPOS', 500, 1000), (500, 1000))
        # A CIEND holding a single value is expected to raise ValueError.
        lone = self._make_variant('<DEL>', 'CIEND=50')
        with self.assertRaises(ValueError):
            adjust(lone, 'CIEND', 500, 1000)
Exemplo n.º 6
0
def vcfToBedpe(vcf_file, bedpe_out):
    """Convert the VCF lines in ``vcf_file`` to BEDPE written to ``bedpe_out``.

    Header lines are collected until the first record is seen.  At that point
    the header is loaded into a ``Vcf`` object, re-rendered to ``bedpe_out``
    (with ``##fileformat`` rewritten to ``BEDPE`` and ``##fileDate`` set to
    the current date) and followed by the BEDPE column line.

    Records are then converted one by one.  Non-BND records and BND records
    without ``MATEID`` are written immediately.  A BND record with a
    ``MATEID`` is written together with its mate once both have been seen;
    until then it is stored under its own ID, in ``sec_bnds`` when it
    carries the ``SECONDARY`` flag and in ``bnds`` otherwise.  Records still
    stored when the input ends are written on their own, with a warning on
    stderr.

    NOTE(review): the header is only written when the first record is
    reached, so header-only input produces no output -- confirm whether that
    is intended.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable file-like object; it is closed before returning.

    Returns:
        None.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    # Unmatched BND records, keyed by ID column: bnds holds records without
    # the SECONDARY flag, sec_bnds holds records with it.
    bnds = dict()
    sec_bnds = dict()
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                tag = line.split('=')[0]
                if tag == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                elif tag == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            elif line.startswith('#'):
                # Column line: everything past the ninth column is a sample.
                # startswith() also copes with a bare '#' line, which the
                # previous line[1] lookup could not.
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            else:
                # First record reached: flush the header before converting.
                in_header = False
                vcf.add_header(header)
                if "SVTYPE" in [info.id for info in vcf.info_list]:
                    vcf.add_info_after(
                        "SVTYPE", "POS", 1, 'Integer',
                        'Position of the variant described in this record')
                header_text = vcf.get_header()
                # Drop the last line of the rendered header; the BEDPE
                # column line written below takes its place.
                bedpe_out.write(
                    header_text[:header_text.rfind('\n')] + '\n')
                final_header_line = [
                    '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B',
                    'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE',
                    'FILTER', 'NAME_A', 'REF_A', 'ALT_A', 'NAME_B', 'REF_B',
                    'ALT_B', 'INFO_A', 'INFO_B'
                ]
                if sample_list:
                    final_header_line += ['FORMAT'] + sample_list
                bedpe_out.write('\t'.join(final_header_line) + '\n')

        var = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        var.set_info("POS", var.pos)
        # If there is no MATEID then assume this is a single-ended BND and simply output
        if var.info['SVTYPE'] != 'BND' or 'MATEID' not in var.info:
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        mate_id = var.info['MATEID']
        if 'SECONDARY' in var.info:
            if mate_id in bnds:
                # The primary mate was seen earlier.
                var1 = bnds.pop(mate_id)
                bedpe_out.write(str(converter.convert(var1, var)) + '\n')
            else:
                sec_bnds[var.var_id] = var
        elif mate_id in sec_bnds:
            var2 = sec_bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var, var2)) + '\n')
        elif mate_id in bnds:
            # Neither mate carries SECONDARY.  Without this branch both
            # records would sit in bnds and be reported as unpaired; the
            # record seen first is passed as the first argument.
            var1 = bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var1, var)) + '\n')
        else:
            bnds[var.var_id] = var

    # Whatever is left never met its mate: emit each record on its own.
    for bnd in bnds:
        sys.stderr.write('Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')
    for bnd in sec_bnds:
        sys.stderr.write('Warning: missing primary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(None, sec_bnds[bnd])) + '\n')

    # close the files
    bedpe_out.close()
Exemplo n.º 7
0
def vcfToBedpe(vcf_file, bedpe_out):
    """Translate a stream of VCF lines into BEDPE lines on ``bedpe_out``.

    While the header is being read, ``##`` lines are collected (with
    ``##fileformat`` rewritten to ``BEDPE`` and ``##fileDate`` set to the
    current date) and the sample names are taken from the column line.  When
    the first record arrives the header is loaded into a ``Vcf`` object,
    rendered to ``bedpe_out`` minus its last line, and followed by the BEDPE
    column line.

    Non-BND records and BND records without ``MATEID`` are converted on
    their own.  Other BND records are converted together with their mate
    once both have been read; records whose mate never appears are converted
    alone at the end, with a warning on stderr.

    NOTE(review): header-only input never reaches the header flush, so
    nothing is written for it -- confirm whether that is intended.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable file-like object; it is closed before returning.

    Returns:
        None.
    """
    bedpe_converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    header_lines = []
    samples = []
    # Unmatched BND records keyed by their ID column, split by whether the
    # record carries the SECONDARY flag.
    pending_primary = {}
    pending_secondary = {}
    header_pending = True
    for line in vcf_file:
        if header_pending:
            if line[:2] == '##':
                if line.split('=')[0] == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                if line.split('=')[0] == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header_lines.append(line)
                continue
            if line[0] == '#' and line[1] != '#':
                # Column line: sample names start at the tenth column.
                samples = line.rstrip().split('\t')[9:]
                header_lines.append(line)
                continue

            # First record: emit the header before converting anything.
            header_pending = False
            vcf.add_header(header_lines)
            if any(info.id == "SVTYPE" for info in vcf.info_list):
                vcf.add_info_after(
                    "SVTYPE", "POS", 1, 'Integer',
                    'Position of the variant described in this record')
            rendered = vcf.get_header()
            # Everything up to the last newline, i.e. without the final line.
            bedpe_out.write(rendered[:rendered.rfind('\n')] + '\n')
            columns = [
                '#CHROM_A',
                'START_A',
                'END_A',
                'CHROM_B',
                'START_B',
                'END_B',
                'ID',
                'QUAL',
                'STRAND_A',
                'STRAND_B',
                'TYPE',
                'FILTER',
                'NAME_A',
                'REF_A',
                'ALT_A',
                'NAME_B',
                'REF_B',
                'ALT_B',
                'INFO_A',
                'INFO_B',
            ]
            if len(samples) > 0:
                columns = columns + ['FORMAT', '\t'.join(map(str, samples))]
            bedpe_out.write('\t'.join(columns) + '\n')

        record = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        record.set_info("POS", record.pos)

        # A BND without MATEID is assumed to be single-ended and is written
        # straight away, just like any non-BND record.
        has_mate = record.info['SVTYPE'] == 'BND' and 'MATEID' in record.info
        if not has_mate:
            bedpe_out.write(str(bedpe_converter.convert(record)) + '\n')
            continue

        mate_id = record.info['MATEID']
        if 'SECONDARY' in record.info:
            if mate_id in pending_primary:
                primary = pending_primary.pop(mate_id)
                bedpe_out.write(
                    str(bedpe_converter.convert(primary, record)) + '\n')
            else:
                pending_secondary[record.var_id] = record
        elif mate_id in pending_secondary:
            secondary = pending_secondary.pop(mate_id)
            bedpe_out.write(
                str(bedpe_converter.convert(record, secondary)) + '\n')
        elif mate_id in pending_primary:
            # Both mates lack SECONDARY: the earlier one goes first.
            earlier = pending_primary.pop(mate_id)
            bedpe_out.write(
                str(bedpe_converter.convert(earlier, record)) + '\n')
        else:
            pending_primary[record.var_id] = record

    # Leftovers never met their mate; write each one alone with a warning.
    for var_id, leftover in pending_primary.items():
        sys.stderr.write(
            'Warning: missing secondary multiline variant at ID:' + var_id +
            '\n')
        bedpe_out.write(str(bedpe_converter.convert(leftover, None)) + '\n')
    for var_id, leftover in pending_secondary.items():
        sys.stderr.write(
            'Warning: missing primary multiline variant at ID:' + var_id +
            '\n')
        bedpe_out.write(str(bedpe_converter.convert(None, leftover)) + '\n')

    # close the files
    bedpe_out.close()
Exemplo n.º 8
0
def vcfToBedpe(vcf_file, bedpe_out):
    """Convert the VCF lines in ``vcf_file`` to BEDPE written to ``bedpe_out``.

    Header lines are collected until the first record is seen.  At that point
    the header is loaded into a ``Vcf`` object, re-rendered to ``bedpe_out``
    (with ``##fileformat`` rewritten to ``BEDPE`` and ``##fileDate`` set to
    the current date) and followed by the BEDPE column line.

    Records are then converted one by one.  Non-BND records are written
    immediately.  A BND record is written together with its mate as soon as
    the record named by its ``MATEID`` has already been stored; otherwise it
    is stored under its own ID.  Records still stored when the input ends
    are written on their own, with a warning on stderr.

    NOTE(review): the header is only written when the first record is
    reached, so header-only input produces no output -- confirm whether that
    is intended.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable file-like object; it is closed before returning.

    Returns:
        None.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    # BND records whose mate has not been seen yet, keyed by the ID column.
    bnds = dict()
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                tag = line.split('=')[0]
                if tag == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                elif tag == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            elif line.startswith('#'):
                # Column line: everything past the ninth column is a sample.
                # startswith() also copes with a bare '#' line, which the
                # previous line[1] lookup could not.
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            else:
                # First record reached: flush the header before converting.
                in_header = False
                vcf.add_header(header)
                if "SVTYPE" in [info.id for info in vcf.info_list]:
                    vcf.add_info_after(
                        "SVTYPE", "POS", 1, 'Integer',
                        'Position of the variant described in this record')
                header_text = vcf.get_header()
                # Drop the last line of the rendered header; the BEDPE
                # column line written below takes its place.
                bedpe_out.write(
                    header_text[:header_text.rfind('\n')] + '\n')
                final_header_line = [
                    '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B',
                    'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE',
                    'FILTER', 'NAME_A', 'REF_A', 'ALT_A', 'NAME_B', 'REF_B',
                    'ALT_B', 'INFO_A', 'INFO_B'
                ]
                if sample_list:
                    final_header_line += ['FORMAT'] + sample_list
                bedpe_out.write('\t'.join(final_header_line) + '\n')

        var = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        var.set_info("POS", var.pos)
        if var.info['SVTYPE'] != 'BND':
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        # Manta doesn't use the SECONDARY flag, so pairing relies solely on
        # whether the mate's ID has already been stored in bnds.
        if 'MATEID' in var.info and var.info['MATEID'] in bnds:
            mate = bnds.pop(var.info['MATEID'])
            bedpe_out.write(str(converter.convert(mate, var)) + '\n')
        else:
            # The EVENT ID is not unique, so records are keyed by the ID
            # column.  BNDs without a MATEID are stored as well and end up
            # in the unpaired flush below.
            bnds[var.var_id] = var

    # Whatever is left never met its mate: emit each record on its own.
    # (The former sec_bnds bookkeeping was never populated here, and its
    # dict.viewkeys() call does not exist on Python 3, so it was removed.)
    for bnd in bnds:
        sys.stderr.write(
            'Warning: missing secondary multiline variant at ID:' + bnd +
            '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')

    # close the files
    bedpe_out.close()