예제 #1
0
파일: run.py 프로젝트: friend1ws/sv_utils
def vcf_main(args):
    """Convert deletions and tandem duplications of an SV result file to VCF-style rows.

    Reads the tab-separated file ``args.result_file`` and writes one
    tab-separated line per accepted variant to ``args.output`` with the
    fields: chromosome, position, '.', ref, alt, '.', "PASS", '.'.

    Args:
        args: object providing the attributes
            result_file    -- path of the input table; a line starting with
                              "Chr_1<TAB>Pos_1" is treated as the header and
                              passed to ``header_info.read``.
            output         -- path of the file to write.
            reference      -- reference genome handed to ``my_seq.get_seq``.
            max_size_thres -- variants whose two positions are farther apart
                              than this (after ``int()``) are skipped.

    Only rows whose variant type is "deletion" or "tandem_duplication" are
    written.  Every other type (including "inversion" and "translocation")
    is skipped; previously an unrecognised type fell through and re-emitted
    the values left over from the preceding row.
    """
    max_size = int(args.max_size_thres)

    # Output is opened first so it is created/truncated even for an empty input.
    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:
            if line.startswith("Chr_1" + '\t' + "Pos_1"):
                header_info.read(line.rstrip('\n'))
                continue

            F = line.rstrip('\n').split('\t')

            variant_type = F[header_info.variant_type]
            if variant_type not in ("deletion", "tandem_duplication"):
                continue

            chrom = F[header_info.chr_1]
            pos_1 = int(F[header_info.pos_1])
            pos_2 = int(F[header_info.pos_2])
            if abs(pos_1 - pos_2) > max_size:
                continue

            # "---" marks the absence of an inserted sequence at the junction.
            inserted_seq = F[header_info.inserted_seq]
            if inserted_seq == "---":
                inserted_seq = ""

            if variant_type == "deletion":
                # ref spans pos_1 .. pos_2 - 1; alt keeps only the anchor base
                # (plus any inserted sequence).
                ref_seq = my_seq.get_seq(args.reference, chrom, pos_1, pos_2 - 1)
                alt_seq = ref_seq[0] + inserted_seq
                pos = F[header_info.pos_1]
            else:  # tandem_duplication
                # alt spans pos_1 - 1 .. pos_2 (plus any inserted sequence);
                # ref is the single anchor base at pos_1 - 1.
                alt_seq = my_seq.get_seq(args.reference, chrom, pos_1 - 1, pos_2) + inserted_seq
                ref_seq = alt_seq[0]
                pos = str(pos_1 - 1)

            hout.write('\t'.join([chrom, pos, '.', ref_seq, alt_seq, '.', "PASS", '.']) + '\n')
예제 #2
0
def generate_contig(input_file, output_file, tumor_bp_file, tumor_bam,
                    reference_genome, min_contig_length):
    """Assemble a contig per breakpoint of input_file and append it as a new column.

    Processing steps:
      1. For each non-header row of ``input_file`` (tab-separated), query the
         tabix-indexed ``tumor_bp_file`` around the row's position and collect
         the read IDs (``;``-separated, 6th column of the tabix record) of the
         record matching the row's first four columns.  The breakpoint key is
         those four columns joined by ','.
      2. Scan ``tumor_bam`` and write ``key<TAB>read/mate<TAB>sequence`` for
         every collected read to ``output_file + ".tmp2.contig.unsorted"``,
         skipping secondary, supplementary and duplicate alignments.
      3. Sort that file by its first column with the external ``sort``
         command into ``output_file + ".tmp2.contig.sorted"``.
      4. For each key, fetch 20 bases of reference next to the breakpoint
         (reverse-complemented unless the third key field is "+") and pass
         them with the reads to ``assemble_seq``.
      5. Write every input row that has a contig of at least
         ``min_contig_length`` to ``output_file`` with the contig appended.

    Args:
        input_file: tab-separated breakpoint table; a row whose first column
            is "Chr" is treated as the header.
        output_file: path of the result; also the prefix of the two
            intermediate files, which are left in place.
        tumor_bp_file: tabix-indexed breakpoint file.
        tumor_bam: BAM file opened with ``pysam.Samfile(..., "rb")``.
        reference_genome: reference passed to ``my_seq.get_seq``.
        min_contig_length: minimum contig length for a row to be written.
    """
    unsorted_path = output_file + ".tmp2.contig.unsorted"
    sorted_path = output_file + ".tmp2.contig.sorted"

    # --- 1. read ID -> breakpoint key -------------------------------------
    readid2key = {}
    tumor_bp_db = pysam.TabixFile(tumor_bp_file)
    try:
        with open(input_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                if F[0] == "Chr": continue

                # A failing lookup (e.g. a contig missing from the index) is
                # reported and the row is skipped; this is deliberately
                # best-effort.
                try:
                    records = tumor_bp_db.fetch(F[0], int(F[1]) - 1, int(F[1]) + 1)
                except Exception as inst:
                    sys.stderr.write("%s: %s\n" % (type(inst), inst.args))
                    continue

                key = ','.join(F[:4])
                for record_line in records:
                    record = record_line.split('\t')
                    # The record start is compared after adding 1 to it.
                    if (record[0] == F[0] and int(record[1]) + 1 == int(F[1])
                            and record[3] == F[2] and record[4] == F[3]):
                        for readid in record[5].split(';'):
                            # drop a trailing "/1" or "/2" mate suffix
                            readid2key[re.sub(r'/\d$', '', readid)] = key
    finally:
        tumor_bp_db.close()

    # --- 2. dump the sequences of the supporting reads --------------------
    bamfile = pysam.Samfile(tumor_bam, "rb")
    try:
        with open(unsorted_path, 'w') as hout:
            for read in bamfile.fetch():
                key = readid2key.get(read.qname)
                if key is None: continue

                flag = int(read.flag)

                # skip secondary (0x100) and supplementary (0x800) alignments
                if flag & 0x900: continue

                # skip duplicated reads (0x400)
                if flag & 0x400: continue

                # a record without a stored sequence cannot contribute
                sequence = read.query_sequence
                if sequence is None: continue

                # 0x40: first read of the pair
                mate_suffix = "/1" if flag & 0x40 else "/2"
                hout.write(key + '\t' + read.qname + mate_suffix + '\t' + sequence + '\n')
    finally:
        bamfile.close()

    # --- 3. group the reads of each key together ---------------------------
    with open(sorted_path, 'w') as hout:
        subprocess.call(["sort", "-k1,1", unsorted_path], stdout=hout)

    # --- 4. assemble one contig per key -----------------------------------
    key2contig = {}
    with open(sorted_path) as hin:
        temp_key = ""
        temp_id2seq = {}
        temp_junc_seq = ""
        for line in hin:
            F = line.rstrip('\n').split('\t')
            if temp_key != F[0]:
                # key changed: flush the reads gathered for the previous key
                if temp_id2seq:
                    key2contig[temp_key] = assemble_seq(
                        temp_id2seq, temp_junc_seq, output_file)

                temp_key = F[0]
                temp_id2seq = {}
                FF = temp_key.split(',')
                if FF[2] == "+":
                    temp_junc_seq = my_seq.get_seq(reference_genome, FF[0],
                                                   int(FF[1]) - 20, int(FF[1]))
                else:
                    temp_junc_seq = my_seq.reverse_complement(
                        my_seq.get_seq(reference_genome, FF[0], int(FF[1]),
                                       int(FF[1]) + 20))

            temp_id2seq[F[1]] = F[2]

        if temp_id2seq:
            key2contig[temp_key] = assemble_seq(temp_id2seq, temp_junc_seq,
                                                output_file)

    # --- 5. emit the rows that obtained a long enough contig --------------
    with open(output_file, 'w') as hout, open(input_file, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            key = ','.join(F[:4])

            # the header row never has a contig, so it is dropped here too
            if key not in key2contig: continue
            contig = key2contig[key]
            if len(contig) < min_contig_length: continue

            hout.write('\t'.join(F) + '\t' + contig + '\n')
def generate_template_seq(output_file, reference, mut_chr, mut_start, mut_end, mut_ref, mut_alt, 
                          motif_chr, motif_start, motif_end, motif_type, motif_strand,
                          junc_list, donor_size, acceptor_size, template_size):
    """Write template sequences around a splicing motif, with and without a mutation.

    The sequences are collected in a dict and written to ``output_file`` as
    ">"-prefixed name lines, each followed by its sequence, in sorted key
    order.  Keys produced:
      * splice_junction_negative_<n>: one per distinct junction, built from
        the reference on both sides of the junction.
      * splice_junction_positive_<n>: same, with the mutation applied; only
        when the mutation lies inside the exonic part of the motif.
      * intron_retention_negative: reference across the exon/intron boundary.
      * intron_retention_positive: the same region with the mutation applied.

    Args:
        output_file: path of the file to write.
        reference: reference genome handed to ``my_seq.get_seq``.
        mut_chr: only used in the diagnostic output.
        mut_start, mut_end: numeric mutation coordinates.
        mut_ref, mut_alt: reference / alternative alleles; '-' as ``mut_ref``
            is handled as an insertion, '-' as ``mut_alt`` as a deletion.
        motif_chr, motif_end: accepted but not read by this function.
        motif_start: numeric start coordinate of the motif.
        motif_type: "donor"; any other value is handled as acceptor.
        motif_strand: '+'; any other value is handled as the opposite strand.
        junc_list: comma-separated junctions of the form "chr:start-end".
        donor_size: "exon,intron" sizes as a comma-separated string.
        acceptor_size: "intron,exon" sizes as a comma-separated string.
        template_size: number used to size the flanks taken on each side.

    Exits the process with status 1 if the reference sequence at the mutation
    position does not equal ``mut_ref``.

    NOTE(review): ``mut_seq_margin`` is not a parameter and is not assigned in
    this function; it has to be provided by the enclosing module -- confirm.
    NOTE(review): the coordinate arithmetic presumably relies on
    ``my_seq.get_seq`` returning an inclusive interval -- confirm against my_seq.
    """

    donor_size_exon, donor_size_intron = [int(x) for x in donor_size.split(',')]
    acceptor_size_intron, acceptor_size_exon = [int(x) for x in acceptor_size.split(',')]

    # duplicates in junc_list are dropped; order is restored later by sorted()
    unique_junc_list = list(set(junc_list.split(',')))

    # Split the motif interval (starting at motif_start) into its exonic and
    # intronic parts; which part comes first depends on motif type and strand.
    if motif_type == "donor":
        if motif_strand == '+':
            motif_exon_start, motif_exon_end = motif_start, motif_start + donor_size_exon - 1
            motif_intron_start, motif_intron_end = motif_start + donor_size_exon, motif_start + donor_size_exon + donor_size_intron - 1
        else:
            motif_intron_start, motif_intron_end = motif_start, motif_start + donor_size_intron - 1
            motif_exon_start, motif_exon_end = motif_start + donor_size_intron, motif_start + donor_size_exon + donor_size_intron - 1
    else: # acceptor
        if motif_strand == '+':
            motif_intron_start, motif_intron_end = motif_start, motif_start + acceptor_size_intron - 1
            motif_exon_start, motif_exon_end = motif_start + acceptor_size_intron, motif_start + acceptor_size_exon + acceptor_size_intron - 1
        else:
            motif_exon_start, motif_exon_end = motif_start, motif_start + acceptor_size_exon - 1
            motif_intron_start, motif_intron_end = motif_start + acceptor_size_exon, motif_start + acceptor_size_exon + acceptor_size_intron - 1

    key2seq = {}  
    # annotated splice junction without mutation
    cnum = 0
    for junc in sorted(unique_junc_list):
        # an entry that does not match "chr:start-end" makes junc_match None
        # and the next line raises AttributeError
        junc_match = re.match(r'([^ \t\n\r\f\v,]+)\:(\d+)\-(\d+)', junc)
        junc_chr, junc_start, junc_end = junc_match.group(1), int(junc_match.group(2)), int(junc_match.group(3))

        # sequence ending at junc_start joined to sequence starting at junc_end
        seq = my_seq.get_seq(reference, junc_chr, junc_start - template_size + 1, junc_start) + \
              my_seq.get_seq(reference, junc_chr, junc_end, junc_end + template_size - 1)
        key2seq["splice_junction_negative_" + str(cnum)] = seq
        cnum = cnum + 1


    # annotated splice junction with mutation (only when mutations occur within exonic motif region)
    if mut_start >= motif_exon_start and mut_end <= motif_exon_end:
        cnum = 0
        for junc in sorted(unique_junc_list):
            junc_match = re.match(r'([^ \t\n\r\f\v,]+)\:(\d+)\-(\d+)', junc)
            junc_chr, junc_start, junc_end = junc_match.group(1), int(junc_match.group(2)), int(junc_match.group(3))

            # Fetch mut_seq_margin-sized sequence on the side of the junction
            # that is mutated (ending at junc_start or starting at junc_end)
            # and express the mutation as 0-based offsets into that string.
            if (motif_type == "donor" and motif_strand == '+') or (motif_type == "acceptor" and motif_strand == '-'):
                mut_seq_tmp = my_seq.get_seq(reference, junc_chr, junc_start - mut_seq_margin + 1, junc_start)
                mut_start_rel, mut_end_rel = mut_start - junc_start + mut_seq_margin - 1, mut_end - junc_start + mut_seq_margin - 1
            else:
                mut_seq_tmp = my_seq.get_seq(reference, junc_chr, junc_end, junc_end + mut_seq_margin - 1)
                mut_start_rel, mut_end_rel = mut_start - junc_end, mut_end - junc_end

            # for debug
            # sanity check: the fetched reference must equal mut_ref, else abort
            if mut_ref != '-' and mut_seq_tmp[mut_start_rel:(mut_end_rel + 1)] != mut_ref:
                print >> sys.stderr, '\t'.join([mut_chr, str(mut_start), str(mut_end), mut_ref, mut_alt, junc])
                print >> sys.stderr, '\t'.join([mut_seq_tmp[mut_start_rel:(mut_end_rel + 1)], mut_ref])
                print >> sys.stderr, "mutation inconsistent!!!"
                sys.exit(1) 

            # SNV
            if mut_ref != '-' and mut_alt != '-': mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_alt + mut_seq_tmp[(mut_end_rel + 1):]

            # deletion
            if mut_alt == '-': mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_seq_tmp[(mut_end_rel + 1):]

            # insertion
            # (mut_alt is placed after the base at offset mut_start_rel)
            if mut_ref == '-': mut_seq_tmp = mut_seq_tmp[:(mut_start_rel + 1)] + mut_alt + mut_seq_tmp[(mut_start_rel + 1):]

            # Trim the mutated side to template_size characters next to the
            # junction and join it with the unmodified reference on the other side.
            if (motif_type == "donor" and motif_strand == '+') or (motif_type == "acceptor" and motif_strand == '-'):
                seq = mut_seq_tmp[(-template_size):] + my_seq.get_seq(reference, junc_chr, junc_end, junc_end + template_size - 1)
            else:
                seq = my_seq.get_seq(reference, junc_chr, junc_start - template_size + 1, junc_start) + mut_seq_tmp[:(template_size)]

            key2seq["splice_junction_positive_" + str(cnum)] = seq
            cnum = cnum + 1

            
    # intron retention without mutation
    # NOTE: junc_chr here is the value left by the last iteration of the
    # junction loops above, not motif_chr or mut_chr.
    if (motif_type == "donor" and motif_strand == '+') or (motif_type == "acceptor" and motif_strand == '-'):
        seq_left_tmp = my_seq.get_seq(reference, junc_chr, motif_exon_end - mut_seq_margin + 1, motif_exon_end)
        seq_right_tmp = my_seq.get_seq(reference, junc_chr, motif_intron_start, motif_intron_start + mut_seq_margin - 1)
        boundary_pos = motif_exon_end
    else:
        seq_left_tmp = my_seq.get_seq(reference, junc_chr, motif_intron_end - mut_seq_margin + 1, motif_intron_end)
        seq_right_tmp = my_seq.get_seq(reference, junc_chr, motif_exon_start, motif_exon_start + mut_seq_margin - 1)
        boundary_pos = motif_intron_end
    key2seq["intron_retention_negative"] = seq_left_tmp[(-template_size):] + seq_right_tmp[:(template_size)]

   
    # intron retention with mutation
    mut_seq_left_tmp, mut_seq_right_tmp = seq_left_tmp, seq_right_tmp

    # in this case, we remove nucleotides from concatenated sequences
    # offsets are relative to the start of the concatenated left + right string
    mut_start_rel, mut_end_rel = mut_start - boundary_pos + mut_seq_margin - 1, mut_end - boundary_pos + mut_seq_margin - 1
    mut_seq_tmp = mut_seq_left_tmp + mut_seq_right_tmp


    # SNV
    if mut_ref != '-' and mut_alt != '-': 

        # for debug
        # NOTE(review): compares a single character with mut_ref, so a
        # mut_ref longer than one base always triggers the exit -- confirm
        # that only single-base substitutions reach this branch.
        if mut_seq_tmp[mut_start_rel] != mut_ref:
            print >> sys.stderr, "mutation inconsistent!!!"
            sys.exit(1)
        mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_alt + mut_seq_tmp[(mut_end_rel + 1):]

        # window of 2 * template_size starting template_size before the
        # end of the left part
        mut_seq_start_pos = mut_seq_margin - template_size
        mut_seq_end_pos = mut_seq_start_pos + 2 * template_size
        key2seq["intron_retention_positive"] = mut_seq_tmp[mut_seq_start_pos:mut_seq_end_pos]


    elif mut_alt == '-': # deletion

        # for debug
        # chained comparison: true when the slice differs from mut_ref AND
        # mut_ref is not '-'
        if mut_seq_tmp[mut_start_rel:(mut_end_rel + 1)] != mut_ref != '-':
            print >> sys.stderr, "mutation inconsistent!!!"
            sys.exit(1)
        mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_seq_tmp[(mut_end_rel + 1):]
 
        # shift the window start left by del_size_left, which is derived
        # from how far the deletion reaches on the left of boundary_pos
        del_size_left = max(0, (min(boundary_pos, mut_end) - mut_start))
        mut_seq_start_pos = mut_seq_margin - template_size - del_size_left
        mut_seq_end_pos = mut_seq_start_pos + 2 * template_size

        key2seq["intron_retention_positive"] = mut_seq_tmp[mut_seq_start_pos:mut_seq_end_pos]


    elif mut_ref == '-': #insertion
     
        mut_seq_tmp = mut_seq_tmp[:(mut_start_rel + 1)] + mut_alt + mut_seq_tmp[(mut_start_rel + 1):]
        # the window start is shifted right by the full inserted length
        ins_size_left = len(mut_alt)
        mut_seq_start_pos = mut_seq_margin - template_size + ins_size_left
        mut_seq_end_pos = mut_seq_start_pos + 2 * template_size
        key2seq["intron_retention_positive"] = mut_seq_tmp[mut_seq_start_pos:mut_seq_end_pos]


    # one ">name" line followed by one sequence line per entry, sorted by key
    hout = open(output_file, 'w')
    for key in sorted(key2seq):
        print >> hout, ">" + key + '\n' + key2seq[key]
    
    hout.close()
예제 #4
0
파일: utils.py 프로젝트: friend1ws/sv_utils
def make_mut_db(input_file, output_file_prefix, reference):
    """Build a bgzip-compressed, tabix-indexed BED of long deletions and tandem duplications.

    Reads the tab-separated mutation table ``input_file`` (first line is the
    header) and writes ``output_file_prefix + ".bed"``, which is then
    compressed with ``bgzip -f`` and indexed with ``tabix -p bed``.

    A row is considered when its "Ref" or "Alt" value is at least 10
    characters long:
      * Ref >= 10 characters: written as a "deletion".
      * otherwise Alt >= 10 characters: written as a "tandem_duplication",
        but only if ``my_seq.exact_alignment`` of Alt against the reference
        sequence fetched on either side of the position equals ``len(Alt)``.

    Each output line has 22 tab-separated fields: the BED key (chromosome,
    position - 1, third input column), eight variant fields (chr, pos,
    strand, chr, pos, strand, inserted sequence "---", type), four "---"
    gene-annotation placeholders, and seven read fields (tumor ref count,
    tumor variant count, tumor variant ratio, the same three for normal, and
    the Fisher p-value).

    A variant ratio whose total count is zero is written as "0.0" instead of
    raising ZeroDivisionError.

    Args:
        input_file: path of the mutation table.
        output_file_prefix: output path without the ".bed" extension.
        reference: reference genome handed to ``my_seq.get_seq``.

    NOTE: a header column that is not found keeps the index -1, so the last
    field of each row is read in its place.
    """

    def variant_ratio(ref_count, var_count):
        # Fraction of variant read pairs, rounded to 3 digits; a sample with
        # no read pairs at all has no defined ratio and is reported as 0.0.
        var_num = float(var_count)
        total = float(ref_count) + var_num
        if total == 0:
            return str(0.0)
        return str(round(var_num / total, 3))

    no_gene_annotation = "\t".join(["---"] * 4)

    with open(output_file_prefix + ".bed", "w") as hout, open(input_file, "r") as hin:
        header = hin.readline().rstrip("\n").split("\t")
        # for a repeated column name the last occurrence wins
        column = dict((name, index) for index, name in enumerate(header))
        ref_ind = column.get("Ref", -1)
        alt_ind = column.get("Alt", -1)
        tum_ref_ind = column.get("readPairNum_tumor", -1)
        tum_var_ind = column.get("variantPairNum_tumor", -1)
        nor_ref_ind = column.get("readPairNum_normal", -1)
        nor_var_ind = column.get("variantPairNum_normal", -1)
        fisher_ind = column.get("P-value(fhsher_realignment)", -1)

        for line in hin:
            F = line.rstrip("\n").split("\t")
            ref_len, alt_len = len(F[ref_ind]), len(F[alt_ind])
            if ref_len < 10 and alt_len < 10:
                continue

            chrom, pos = F[0], int(F[1])
            bed_key = "\t".join([chrom, str(pos - 1), F[2]])
            read_info = "\t".join([
                F[tum_ref_ind],
                F[tum_var_ind],
                variant_ratio(F[tum_ref_ind], F[tum_var_ind]),
                F[nor_ref_ind],
                F[nor_var_ind],
                variant_ratio(F[nor_ref_ind], F[nor_var_ind]),
                F[fisher_ind],
            ])

            if ref_len >= 10:
                # deletion
                var_info = "\t".join([
                    chrom, str(pos - 1), "+",
                    chrom, str(pos + ref_len), "-",
                    "---", "deletion",
                ])
            else:
                # tandem_duplication: Alt must align exactly over its whole
                # length to the reference on at least one side of the position.
                flanking_seq_1 = my_seq.get_seq(reference, chrom, pos + 1, pos + alt_len)
                flanking_seq_1_match = my_seq.exact_alignment(F[alt_ind], flanking_seq_1)
                flanking_seq_2 = my_seq.get_seq(reference, chrom, pos - alt_len + 1, pos)
                flanking_seq_2_match = my_seq.exact_alignment(F[alt_ind], flanking_seq_2)

                if flanking_seq_1_match != alt_len and flanking_seq_2_match != alt_len:
                    continue

                var_info = "\t".join([
                    chrom, str(pos + 1), "-",
                    chrom, str(pos + alt_len), "+",
                    "---", "tandem_duplication",
                ])

            hout.write("\t".join([bed_key, var_info, no_gene_annotation, read_info]) + "\n")

    subprocess.call(["bgzip", "-f", output_file_prefix + ".bed"])
    subprocess.call(["tabix", "-p", "bed", output_file_prefix + ".bed.gz"])