def main():
    """Merge the VCF files named on the command line into one sorted stream.

    Each path in ``sys.argv[1:]`` is passed to ``l_bp.parse_vcf`` together
    with the shared record and header lists (presumably filled in place --
    confirm against ``l_bp``); the value it returns is iterated as sample
    names, and one ``##SAMPLE`` header is added per name.  The SNAME and ALG
    INFO declarations and the column header line are then appended, headers
    are ordered with ``l_bp.header_line_cmp``, records with
    ``l_bp.vcf_line_cmp``, and everything is written to standard output.

    Exits with the usage text (on stderr, status 1) when no file is given.
    """
    # Local import keeps this function self-contained; cmp_to_key adapts the
    # old-style comparison functions to the ``key=`` API, which is the only
    # one list.sort() accepts on Python 3.
    import functools

    usage = """%prog <VCF file 1> <VCF file 2> ... <VCF file N>

l_sort
Author: Ryan Layer, Colby Chiang, & Ira Hall
Description: sort N VCF files into a single file
Version: 0.01
"""

    if len(sys.argv) < 2:
        # sys.exit is always available, unlike the interactive ``exit``
        # helper that the site module installs.
        sys.exit(usage)

    vcf_lines = []
    vcf_headers = []

    for vcf_file_name in sys.argv[1:]:
        samples = l_bp.parse_vcf(vcf_file_name, vcf_lines, vcf_headers)
        vcf_headers.extend(
            "##SAMPLE=<ID=" + sample + ">\n" for sample in samples
        )

    vcf_headers.append(
        "##INFO=<ID=SNAME,Number=.,Type=String,"
        "Description=\"Source sample name\">\n"
    )
    vcf_headers.append(
        "##INFO=<ID=ALG,Number=1,Type=String,"
        "Description=\"Evidence PDF aggregation algorithm\">\n"
    )
    vcf_headers.append(
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tVARIOUS\n"
    )

    vcf_headers.sort(key=functools.cmp_to_key(l_bp.header_line_cmp))
    vcf_lines.sort(key=functools.cmp_to_key(l_bp.vcf_line_cmp))

    # The lines already carry their own newline, so they are written
    # verbatim.  ``print(line, end=' ')`` would put a stray space at the
    # start of every following line and corrupt the tab-separated output.
    # (A disabled step that padded one-sided BND strand evidence used to sit
    # in the record loop; records are emitted unchanged.)
    sys.stdout.writelines(vcf_headers)
    sys.stdout.writelines(vcf_lines)
Exemplo n.º 2
0
def main():
    """Merge the VCF files named on the command line into one sorted stream.

    Each path in ``sys.argv[1:]`` is passed to ``l_bp.parse_vcf`` together
    with the shared record and header lists (presumably filled in place --
    confirm against ``l_bp``); the value it returns is iterated as sample
    names, and one ``##SAMPLE`` header is added per name.  The SNAME and ALG
    INFO declarations and the column header line are appended afterwards.
    Headers are ordered with ``l_bp.header_line_cmp`` and records with
    ``l_bp.vcf_line_cmp`` before being written to standard output.

    Exits with status 1 when no input file is given; the usage text is
    reported on stderr.
    """
    # Local import keeps this function self-contained; cmp_to_key lets the
    # comparison functions be used through ``key=``, which works on both
    # Python 2.7 and Python 3 (``sort(cmp=...)`` is Python 2 only).
    import functools

    usage = """%prog <VCF file 1> <VCF file 2> ... <VCF file N>

l_sort
Author: Ryan Layer, Colby Chiang, & Ira Hall
Description: sort N VCF files into a single file
Version: 0.01
"""

    if len(sys.argv) < 2:
        # Previously the usage text was built but never shown.  Passing it
        # to sys.exit prints it on stderr and still exits with status 1.
        sys.exit(usage)

    vcf_lines = []
    vcf_headers = []

    for vcf_file_name in sys.argv[1:]:
        samples = l_bp.parse_vcf(vcf_file_name, vcf_lines, vcf_headers)
        for sample in samples:
            vcf_headers.append("##SAMPLE=<ID=" + sample + ">\n")

    vcf_headers.append(
        "##INFO=<ID=SNAME,Number=.,Type=String,"
        "Description=\"Source sample name\">\n"
    )
    vcf_headers.append(
        "##INFO=<ID=ALG,Number=1,Type=String,"
        "Description=\"Evidence PDF aggregation algorithm\">\n"
    )
    vcf_headers.append(
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tVARIOUS\n"
    )

    vcf_headers.sort(key=functools.cmp_to_key(l_bp.header_line_cmp))
    vcf_lines.sort(key=functools.cmp_to_key(l_bp.vcf_line_cmp))

    # Lines are newline-terminated, so writing them verbatim matches what
    # the Python 2 ``print line,`` form produced, without relying on the
    # print statement.  (A disabled step that padded one-sided BND strand
    # evidence used to sit in the record loop; records go out unchanged.)
    write = sys.stdout.write
    for header in vcf_headers:
        write(header)
    for record in vcf_lines:
        write(record)
Exemplo n.º 3
0
def l_cluster(file_name, percent_slop=0, fixed_slop=0):
    """Print the sorted headers of a VCF file and cluster its records.

    The file is read through ``l_bp.parse_vcf`` (called with
    ``add_sname=False``).  Sample names are taken from the ``##SAMPLE``
    header lines, the headers are ordered with ``l_bp.header_line_cmp`` and
    written to standard output, and the records are then walked in the
    order ``parse_vcf`` delivered them.  Consecutive records are grouped
    while they share the left chromosome and SV type and their left
    interval starts no later than the furthest left-interval end seen in
    the group; each finished group is handed to ``r_cluster``.

    Args:
        file_name: path of the VCF file to read.
        percent_slop: forwarded unchanged to ``l_bp.breakpoint``.
        fixed_slop: forwarded unchanged to ``l_bp.breakpoint``.

    Returns:
        None.  Output is produced by the header write here and by whatever
        ``r_cluster`` emits.
    """
    # Function-scope imports keep the block self-contained.
    import functools
    import sys

    v_id = 0
    vcf_lines = []
    # Built-in set replaces the deprecated, Python 2-only ``sets.Set``.
    vcf_headers = set()
    l_bp.parse_vcf(file_name, vcf_lines, vcf_headers, add_sname=False)

    vcf_headers.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

    # "##SAMPLE=<ID=" is 13 characters long and the line closes with ">",
    # so the [13:-1] slice of the stripped line is the bare sample name.
    # NOTE(review): the names come out in set-iteration order, which is not
    # a defined order -- confirm that r_cluster does not depend on it.
    sample_order = [
        header.rstrip()[13:-1]
        for header in vcf_headers
        if header.startswith('##SAMPLE')
    ]

    sorted_headers = sorted(
        vcf_headers, key=functools.cmp_to_key(l_bp.header_line_cmp)
    )
    # Header lines carry their own newline; write them verbatim.
    sys.stdout.writelines(sorted_headers)

    cluster = []
    cluster_sv_type = ''
    cluster_max_end = -1
    cluster_chrom = ''

    for line in vcf_lines:
        b = l_bp.breakpoint(line,
                            percent_slop=percent_slop,
                            fixed_slop=fixed_slop)

        extends_cluster = (not cluster) or (
            b.start_l <= cluster_max_end
            and b.chr_l == cluster_chrom
            and b.sv_type == cluster_sv_type
        )

        if extends_cluster:
            cluster.append(b)
            cluster_max_end = max(cluster_max_end, b.end_l)
        else:
            # The current group is complete: emit it and start a new one
            # seeded with this breakpoint.
            v_id = r_cluster(cluster, sample_order, v_id)
            cluster = [b]
            cluster_max_end = b.end_l

        cluster_chrom = b.chr_l
        cluster_sv_type = b.sv_type

    # Flush the group that was still open when the input ran out.
    if cluster:
        r_cluster(cluster, sample_order, v_id)
Exemplo n.º 4
0
def l_cluster(file_name, percent_slop=0, fixed_slop=0):
    """Print the sorted headers of a VCF file and cluster its records.

    The file is read through ``l_bp.parse_vcf`` (called with
    ``add_sname=False``).  Sample names are taken from the ``##SAMPLE``
    header lines, the headers are ordered with ``l_bp.header_line_cmp`` and
    written to standard output, and the records are then walked in the
    order ``parse_vcf`` delivered them.  Consecutive records are grouped
    while they share the left chromosome and SV type and their left
    interval starts no later than the furthest left-interval end seen in
    the group; each finished group is handed to ``r_cluster``.

    Args:
        file_name: path of the VCF file to read.
        percent_slop: forwarded unchanged to ``l_bp.breakpoint``.
        fixed_slop: forwarded unchanged to ``l_bp.breakpoint``.

    Returns:
        None.  Output is produced by the header write here and by whatever
        ``r_cluster`` emits.
    """
    # Function-scope imports keep the block self-contained.
    import functools
    import sys

    v_id = 0
    vcf_lines = []
    # Built-in set replaces the deprecated, Python 2-only ``sets.Set``.
    vcf_headers = set()
    l_bp.parse_vcf(file_name, vcf_lines, vcf_headers, add_sname=False)

    vcf_headers.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

    # "##SAMPLE=<ID=" is 13 characters long and the line closes with ">",
    # so the [13:-1] slice of the stripped line is the bare sample name.
    # NOTE(review): the names come out in set-iteration order, which is not
    # a defined order -- confirm that r_cluster does not depend on it.
    sample_order = [
        header.rstrip()[13:-1]
        for header in vcf_headers
        if header.startswith('##SAMPLE')
    ]

    sorted_headers = sorted(
        vcf_headers, key=functools.cmp_to_key(l_bp.header_line_cmp)
    )
    # Header lines carry their own newline; write them verbatim.
    sys.stdout.writelines(sorted_headers)

    cluster = []
    cluster_sv_type = ''
    cluster_max_end = -1
    cluster_chrom = ''

    for line in vcf_lines:
        b = l_bp.breakpoint(line,
                            percent_slop=percent_slop,
                            fixed_slop=fixed_slop)

        extends_cluster = (not cluster) or (
            b.start_l <= cluster_max_end
            and b.chr_l == cluster_chrom
            and b.sv_type == cluster_sv_type
        )

        if extends_cluster:
            cluster.append(b)
            cluster_max_end = max(cluster_max_end, b.end_l)
        else:
            # The current group is complete: emit it and start a new one
            # seeded with this breakpoint.
            v_id = r_cluster(cluster, sample_order, v_id)
            cluster = [b]
            cluster_max_end = b.end_l

        cluster_chrom = b.chr_l
        cluster_sv_type = b.sv_type

    # Flush the group that was still open when the input ran out.
    if cluster:
        r_cluster(cluster, sample_order, v_id)
Exemplo n.º 5
0
def main():
    """Merge the VCF files named on the command line into one sorted stream.

    A header set is seeded with the SVNAME INFO declaration and the column
    header line.  Each path in ``sys.argv[1:]`` is then passed to
    ``l_bp.parse_vcf`` together with the shared record list and header set
    (presumably filled in place -- confirm against ``l_bp``); the value it
    returns is iterated as sample names, and one ``##SAMPLE`` header is
    added per name.  Headers are ordered with ``l_bp.header_line_cmp`` and
    records with ``l_bp.vcf_line_cmp`` before being written to standard
    output.

    Exits with status 1 when no input file is given; the usage text is
    reported on stderr.
    """
    # Local import keeps this function self-contained; cmp_to_key lets the
    # comparison functions be used through ``key=``, which works on both
    # Python 2.7 and Python 3 (``sort(cmp=...)`` is Python 2 only).
    import functools

    usage = """%prog <VCF file 1> <VCF file 2> ... <VCF file N>

l_sort
Author: Ryan Layer, Colby Chiang, & Ira Hall
Description: sort N VCF files into a single file
Version: 0.01
"""

    if len(sys.argv) < 2:
        # Previously the usage text was built but never shown.  Passing it
        # to sys.exit prints it on stderr and still exits with status 1.
        sys.exit(usage)

    vcf_lines = []
    # Built-in set replaces the deprecated, Python 2-only ``sets.Set``;
    # it also collapses header lines repeated across input files.
    # NOTE(review): the INFO key below is spelled SVNAME -- confirm it
    # matches the key that parse_vcf writes into the records.
    vcf_headers = {
        "##INFO=<ID=SVNAME,Number=.,Type=String,"
        "Description=\"Source sample name\">\n",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tVARIOUS\n",
    }

    for vcf_file_name in sys.argv[1:]:
        samples = l_bp.parse_vcf(vcf_file_name, vcf_lines, vcf_headers)
        for sample in samples:
            vcf_headers.add("##SAMPLE=<ID=" + sample + ">\n")

    sorted_headers = sorted(
        vcf_headers, key=functools.cmp_to_key(l_bp.header_line_cmp)
    )
    vcf_lines.sort(key=functools.cmp_to_key(l_bp.vcf_line_cmp))

    # Lines are newline-terminated, so writing them verbatim matches what
    # the Python 2 ``print line,`` form produced, without relying on the
    # print statement.
    sys.stdout.writelines(sorted_headers)
    sys.stdout.writelines(vcf_lines)