Пример #1
0
# SAM flag bits that disqualify an alignment: read unmapped (0x4), mate
# unmapped (0x8), secondary alignment (0x100), duplicate (0x400) and
# supplementary alignment (0x800).
_SV_READ_SKIP_FLAGS = 0x4 | 0x8 | 0x100 | 0x400 | 0x800


def _svFetchUsableReads(bamfile, juncChr, juncPos, search_length):
    """Yield the usable alignments within `search_length` of a break point."""
    center = int(juncPos)
    for read in bamfile.fetch(juncChr, max(0, center - search_length),
                              center + search_length):
        if int(read.flag) & _SV_READ_SKIP_FLAGS:
            continue
        yield read


def _svOnBreakpointSide(pos, juncPos, juncDir):
    """Tell whether `pos` lies on the side of `juncPos` selected by `juncDir`."""
    if juncDir == "+":
        return pos <= juncPos
    if juncDir == "-":
        return pos >= juncPos
    return False


def _svMarkSupportingReads(bamfile, readID2exist, chrA, posA, dirA, chrB,
                           posB, dirB, search_length, search_margin):
    """Record in `readID2exist` the read names near break point A to re-align."""
    posA = int(posA)
    posB = int(posB)
    for read in _svFetchUsableReads(bamfile, chrA, posA, search_length):
        flag = int(read.flag)
        pos_current = int(read.pos + 1)
        pos_pair = int(read.pnext + 1)
        chr_pair = bamfile.getrname(read.rnext)

        # the read (with margin) contains break point
        containsJunc = (pos_current - search_margin <= posA <=
                        (read.aend - 1) + search_margin)

        # the read pair covers break point: this read is on the forward
        # strand, its mate on the reverse strand, and the break point lies
        # between their start positions
        spansJunc = (chr_pair == chrA and pos_current <= posA <= pos_pair
                     and not flag & 0x10 and flag & 0x20)

        # the read pair bridges both break points: each mate starts on the
        # side of its own break point that the junction direction selects
        bridgesJunc = (chr_pair == chrB
                       and _svOnBreakpointSide(pos_current, posA, dirA)
                       and _svOnBreakpointSide(pos_pair, posB, dirB))

        if containsJunc or spansJunc or bridgesJunc:
            readID2exist[read.qname] = 1


def _svCollectSequences(bamfile, juncChr, juncPos, search_length,
                        readID2exist, readID2seq1, readID2seq2):
    """Store the original-orientation sequence of every selected read."""
    for read in _svFetchUsableReads(bamfile, juncChr, juncPos, search_length):
        if read.qname not in readID2exist:
            continue

        flag = int(read.flag)

        # reads aligned to the reverse strand are stored reverse-complemented
        # in the BAM file, so flip them back
        if flag & 0x10:
            tempSeq = utils.reverseComplement(str(read.seq))
        else:
            tempSeq = read.seq

        # the first read
        if flag & 0x40:
            readID2seq1[read.qname] = tempSeq
        else:
            readID2seq2[read.qname] = tempSeq


def extractSVReadPairs(bamFilePath, outputFilePath, juncChr1, juncPos1,
                       juncDir1, juncChr2, juncPos2, juncDir2, max_depth,
                       search_length, search_margin):
    """
        read pairs containing break points are extracted. (yshira 2015/04/23)
        The exact condition is as follows:

        1. one of the read in the pair has the break point of the SV candidate
        2. the start positions of the read pairs are within `search_length` of
           the break point of the SV candidate (the original note mentions 800bp)

        Some minor concern for the above conditions are:
        1. Depending on the choice of the "start position" or "end position",
           the distance between the read and break point differs. This can
           generate slight bias... (but I believe we can recover this by
           setting sufficient margin, and summarize the alignment result
           carefully.)
        2. Maybe, for some of the read pair, the result of alignment is
           obvious. But should we re-align them?

        Unmapped reads, reads with an unmapped mate, secondary, supplementary
        and duplicate alignments are ignored. A pair is written only when both
        mates were collected; each pair is written as two FASTA records named
        `>readID/1` and `>readID/2`.

        Returns 1 (and leaves the output file untouched) when the number of
        reads counted around either break point is `max_depth` or more,
        0 otherwise.
    """

    bamfile = pysam.Samfile(bamFilePath, 'rb')
    try:
        # if the #sequence read is over the `max_depth`, then that key is ignored
        for juncChr, juncPos in ((juncChr1, juncPos1), (juncChr2, juncPos2)):
            if bamfile.count(juncChr, int(juncPos) - 1,
                             int(juncPos) + 1) >= max_depth:
                sys.stderr.write(
                    "sequence depth exceeds the threshould for: " + ','.join([
                        juncChr1,
                        str(juncPos1), juncDir1, juncChr2,
                        str(juncPos2), juncDir2
                    ]) + '\n')
                return 1

        with open(outputFilePath, 'w') as hOUT:

            # first pass: decide which read names to keep
            readID2exist = {}
            _svMarkSupportingReads(bamfile, readID2exist, juncChr1, juncPos1,
                                   juncDir1, juncChr2, juncPos2, juncDir2,
                                   search_length, search_margin)
            _svMarkSupportingReads(bamfile, readID2exist, juncChr2, juncPos2,
                                   juncDir2, juncChr1, juncPos1, juncDir1,
                                   search_length, search_margin)

            # second pass: a mate may lie in either window, so both windows
            # are scanned again to gather the sequences
            readID2seq1 = {}
            readID2seq2 = {}
            _svCollectSequences(bamfile, juncChr1, juncPos1, search_length,
                                readID2exist, readID2seq1, readID2seq2)
            _svCollectSequences(bamfile, juncChr2, juncPos2, search_length,
                                readID2exist, readID2seq1, readID2seq2)

            for readID in readID2seq1:
                if readID in readID2seq2:
                    hOUT.write('>' + readID + '/1' + '\n')
                    hOUT.write(str(readID2seq1[readID]) + '\n')
                    hOUT.write('>' + readID + '/2' + '\n')
                    hOUT.write(str(readID2seq2[readID]) + '\n')
    finally:
        # close the BAM handle on every path, including the early return
        bamfile.close()

    return 0
Пример #2
0
def _clusterJunctionRecord(bedpeInfo, junctions):
    """Format the output line for one finished cluster.

    `bedpeInfo` holds the eight tab-separated, ';'-joined per-read columns of
    the cluster and `junctions` the ';'-joined junction descriptions. The most
    frequent junction description supplies the reported coordinates,
    directions and inserted sequence.
    """
    tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = \
        bedpeInfo.split('\t')

    # obtain the most frequent junction
    junc_counter = collections.Counter(junctions.split(';'))
    best_junc = junc_counter.most_common(1)[0][0]
    btchr1, btend1, btdir1, btchr2, btend2, btdir2, btinseq = \
        best_junc.split(',')
    btstart1 = str(int(btend1) - 1)
    btstart2 = str(int(btend2) - 1)

    return '\t'.join([
        btchr1, btstart1, btend1, btchr2, btstart2, btend2, tids, btinseq,
        btdir1, btdir2, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds
    ]) + '\t' + junctions


def _clusterJunctionPositionsAgree(F, tend1, tend2, inseqSize):
    """Check the junction position of line `F` against a pooled cluster.

    The shift of the first break point plus the difference in inserted
    sequence length must be matched by the shift of the second break point,
    with the sign determined by the two junction directions.
    """
    # NOTE(review): a missing inserted sequence ("---") counts as length 3
    # here, exactly as in the key built by clusterJunction -- confirm intended.
    shift1 = int(F[2]) - int(tend1)
    if F[8] == "+":
        expectedDiffSize = shift1 + (len(F[7]) - int(inseqSize))
        if F[9] == "+":
            return int(F[5]) == int(tend2) - expectedDiffSize
        if F[9] == "-":
            return int(F[5]) == int(tend2) + expectedDiffSize
    else:
        expectedDiffSize = shift1 + (int(inseqSize) - len(F[7]))
        if F[9] == "+":
            return int(F[5]) == int(tend2) + expectedDiffSize
        if F[9] == "-":
            return int(F[5]) == int(tend2) - expectedDiffSize
    return False


def _clusterJunctionOrientedInseq(F):
    """Return the inserted sequence of line `F`, reverse-complemented if needed."""
    if F[7] != "---" and F[8] == F[9] and F[15] == "2":
        return utils.reverseComplement(F[7])
    return F[7]


def clusterJunction(inputFilePath, outputFilePath, check_margin_size):
    """
        script for merging and summarizing junction read pairs

        Each input line is a tab-separated record with 16 columns:
        chr1, start1, end1, chr2, start2, end2, read id, inserted sequence
        ("---" when absent), dir1, dir2, followed by six per-read columns
        that are carried through unchanged.

        Lines whose chromosomes and directions match a pooled cluster and
        whose junction positions agree are merged into it (the per-read
        columns are joined with ';'). A cluster is written out as soon as a
        line arrives on another chromosome or more than `check_margin_size`
        past the cluster's end1, so the input is assumed to be sorted by
        chr1 and start1. Blank lines are ignored.

        Each output line has the 16 columns of the most frequent junction of
        the cluster plus a 17th column listing every merged junction.
    """

    with open(inputFilePath, 'r') as hIN, open(outputFilePath, 'w') as hOUT:

        mergedBedpeInfo = {}
        mergedJunction = {}
        for line in hIN:

            line = line.rstrip('\n')
            # a blank (typically trailing) line carries no record
            if not line:
                continue

            F = line.split('\t')
            junction = ','.join([
                F[0], F[2], F[8], F[3], F[5], F[9],
                _clusterJunctionOrientedInseq(F)
            ])

            match = 0
            delList = []
            for key in sorted(mergedBedpeInfo):

                tchr1, tstart1, tend1, tchr2, tstart2, tend2, tdir1, tdir2, inseqSize = \
                    key.split('\t')

                # the investigated key is sufficiently far from the current
                # line in the input file and no additional line to merge is
                # expected. therefore flush the key and information
                if F[0] != tchr1 or int(F[1]) > int(tend1) + check_margin_size:
                    hOUT.write(
                        _clusterJunctionRecord(mergedBedpeInfo[key],
                                               mergedJunction[key]) + '\n')
                    # the key is removed after the loop
                    delList.append(key)
                    continue

                # check whether the investigated key and the current line
                # should be merged or not (chr1 already matches at this point)
                if F[3] != tchr2 or F[8] != tdir1 or F[9] != tdir2:
                    continue

                # detailed check on the junction position considering
                # inserted sequences
                if not _clusterJunctionPositionsAgree(F, tend1, tend2,
                                                      inseqSize):
                    continue

                match = 1
                tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = \
                    mergedBedpeInfo[key].split('\t')
                mergedBedpeInfo[key] = '\t'.join([
                    tids + ';' + F[6], tinseqs + ';' + F[7],
                    tmqs1 + ';' + F[10], talns1 + ';' + F[11],
                    tmqs2 + ';' + F[12], talns2 + ';' + F[13],
                    tpinds + ';' + F[14], tcinds + ';' + F[15]
                ])
                mergedJunction[key] = mergedJunction[key] + ";" + junction

            for item in delList:
                del mergedBedpeInfo[item]
                del mergedJunction[item]

            # if the current line in the input file does not match any of the
            # pooled keys, it starts a new cluster
            if match == 0:
                newKey = '\t'.join([
                    F[0], F[1], F[2], F[3], F[4], F[5], F[8], F[9],
                    str(len(F[7]))
                ])
                mergedBedpeInfo[newKey] = F[6] + '\t' + F[7] + '\t' + \
                    '\t'.join(F[10:16])
                mergedJunction[newKey] = junction

        # last treatment: flush whatever is still pooled
        for key in sorted(mergedBedpeInfo):
            hOUT.write(
                _clusterJunctionRecord(mergedBedpeInfo[key],
                                       mergedJunction[key]) + '\n')
Пример #3
0
def _svFetchReferenceSeq(reference_genome, chrom, start, end):
    """Return the upper-cased reference sequence of region chrom:start-end."""
    # NOTE(review): this iterates the result of pysam.faidx as a sequence of
    # lines; confirm that the installed pysam version returns lines here.
    region = chrom + ":" + str(start) + "-" + str(end)
    return ''.join(
        item.rstrip('\n').upper()
        for item in pysam.faidx(reference_genome, region)
        if item[0] != ">")


def _svWriteFastaRecord(hOUT, name, seq):
    """Write one FASTA record (header line, then the sequence on one line)."""
    hOUT.write(name + '\n')
    hOUT.write(seq + '\n')


def getRefAltForSV(outputFilePath, juncChr1, juncPos1, juncDir1, juncChr2,
                   juncPos2, juncDir2, juncSeq, reference_genome,
                   split_refernece_thres, validate_sequence_length):
    """
        for short SV (mid-range (<= split_refernece_thres bp) deletion, tandem
        duplication), we get the two sequence (`_ref` and `_alt`)
        for large SV (> split_refernece_thres bp), we get three sequence (one
        joint sequence by two break points (`_alt`), and two reference
        sequences around the break points (`_ref1`, `_ref2`))

        the only concern is short inversion... (are there some somatic short
        inversion?) however, this will be filtered beforehand by the "cover
        filter", and maybe we have to give up detecting these class of SVs.

        Every record is named
        `>juncChr1,juncPos1,juncDir1,juncChr2,juncPos2,juncDir2` plus its
        suffix, and each flank extends `validate_sequence_length` bases from
        the break point. `juncSeq` is the sequence inserted at the junction;
        "---" means there is none.
    """

    with open(outputFilePath, 'w') as hOUT:

        if juncSeq == "---":
            juncSeq = ""

        pos1 = int(juncPos1)
        pos2 = int(juncPos2)
        flank = validate_sequence_length
        header = '>' + ','.join([
            juncChr1,
            str(juncPos1), juncDir1, juncChr2,
            str(juncPos2), juncDir2
        ])

        # for mid-range deletion or tandem duplication
        if juncChr1 == juncChr2 and abs(
                pos1 - pos2) <= split_refernece_thres and juncDir1 != juncDir2:

            # one reference stretch covering both break points
            seq = _svFetchReferenceSeq(reference_genome, juncChr1,
                                       pos1 - flank, pos2 + flank)
            _svWriteFastaRecord(hOUT, header + "_ref", seq)

            if juncDir1 == "+" and juncDir2 == "-":
                # for mid-range deletion: upstream of break point 1 joined to
                # downstream of break point 2
                seq = _svFetchReferenceSeq(reference_genome, juncChr1,
                                           pos1 - flank, juncPos1)
                seq = seq + juncSeq
                seq = seq + _svFetchReferenceSeq(reference_genome, juncChr2,
                                                 juncPos2, pos2 + flank)
            else:
                # for mid-range tandem duplication: upstream of break point 2
                # joined to downstream of break point 1
                seq = _svFetchReferenceSeq(reference_genome, juncChr2,
                                           pos2 - flank, juncPos2)
                seq = seq + juncSeq
                seq = seq + _svFetchReferenceSeq(reference_genome, juncChr1,
                                                 juncPos1, pos1 + flank)

            _svWriteFastaRecord(hOUT, header + "_alt", seq)

        else:

            seq = _svFetchReferenceSeq(reference_genome, juncChr1,
                                       pos1 - flank, pos1 + flank)
            _svWriteFastaRecord(hOUT, header + "_ref1", seq)

            seq = _svFetchReferenceSeq(reference_genome, juncChr2,
                                       pos2 - flank, pos2 + flank)
            _svWriteFastaRecord(hOUT, header + "_ref2", seq)

            # first half of the joint sequence: the flank of break point 1,
            # reverse-complemented when it lies downstream of the break point
            if juncDir1 == "+":
                tseq = _svFetchReferenceSeq(reference_genome, juncChr1,
                                            pos1 - flank, juncPos1)
            else:
                tseq = _svFetchReferenceSeq(reference_genome, juncChr1,
                                            juncPos1, pos1 + flank)
                tseq = utils.reverseComplement(tseq)

            seq = tseq + juncSeq

            # second half: the flank of break point 2, reverse-complemented
            # when it lies upstream of the break point
            if juncDir2 == "-":
                tseq = _svFetchReferenceSeq(reference_genome, juncChr2,
                                            juncPos2, pos2 + flank)
            else:
                tseq = _svFetchReferenceSeq(reference_genome, juncChr2,
                                            pos2 - flank, juncPos2)
                tseq = utils.reverseComplement(tseq)

            seq = seq + tseq

            _svWriteFastaRecord(hOUT, header + "_alt", seq)
Пример #4
0
# SAM flag bits that disqualify an alignment: read unmapped (0x4), mate
# unmapped (0x8), secondary alignment (0x100), duplicate (0x400) and
# supplementary alignment (0x800).
_SV_READ_SKIP_FLAGS = 0x4 | 0x8 | 0x100 | 0x400 | 0x800


def _svFetchUsableReads(bamfile, juncChr, juncPos, search_length):
    """Yield the usable alignments within `search_length` of a break point."""
    center = int(juncPos)
    for read in bamfile.fetch(juncChr, max(0, center - search_length), center + search_length):
        if int(read.flag) & _SV_READ_SKIP_FLAGS:
            continue
        yield read


def _svOnBreakpointSide(pos, juncPos, juncDir):
    """Tell whether `pos` lies on the side of `juncPos` selected by `juncDir`."""
    if juncDir == "+":
        return pos <= juncPos
    if juncDir == "-":
        return pos >= juncPos
    return False


def _svMarkSupportingReads(bamfile, readID2exist, chrA, posA, dirA, chrB, posB, dirB, search_length, search_margin):
    """Record in `readID2exist` the read names near break point A to re-align."""
    posA = int(posA)
    posB = int(posB)
    for read in _svFetchUsableReads(bamfile, chrA, posA, search_length):
        flag = int(read.flag)
        pos_current = int(read.pos + 1)
        pos_pair = int(read.pnext + 1)
        chr_pair = bamfile.getrname(read.rnext)

        # the read (with margin) contains break point
        containsJunc = pos_current - search_margin <= posA <= (read.aend - 1) + search_margin

        # the read pair covers break point: this read is on the forward
        # strand, its mate on the reverse strand, and the break point lies
        # between their start positions
        spansJunc = chr_pair == chrA and pos_current <= posA <= pos_pair and not flag & 0x10 and flag & 0x20

        # the read pair bridges both break points: each mate starts on the
        # side of its own break point that the junction direction selects
        bridgesJunc = chr_pair == chrB and _svOnBreakpointSide(pos_current, posA, dirA) and _svOnBreakpointSide(pos_pair, posB, dirB)

        if containsJunc or spansJunc or bridgesJunc:
            readID2exist[read.qname] = 1


def _svCollectSequences(bamfile, juncChr, juncPos, search_length, readID2exist, readID2seq1, readID2seq2):
    """Store the original-orientation sequence of every selected read."""
    for read in _svFetchUsableReads(bamfile, juncChr, juncPos, search_length):
        if read.qname not in readID2exist:
            continue

        flag = int(read.flag)

        # reads aligned to the reverse strand are stored reverse-complemented
        # in the BAM file, so flip them back
        if flag & 0x10:
            tempSeq = utils.reverseComplement(str(read.seq))
        else:
            tempSeq = read.seq

        # the first read
        if flag & 0x40:
            readID2seq1[read.qname] = tempSeq
        else:
            readID2seq2[read.qname] = tempSeq


def extractSVReadPairs(bamFilePath, outputFilePath, juncChr1, juncPos1, juncDir1, juncChr2, juncPos2, juncDir2, max_depth, search_length, search_margin):

    """
        read pairs containing break points are extracted. (yshira 2015/04/23)
        The exact condition is as follows:

        1. one of the read in the pair has the break point of the SV candidate
        2. the start positions of the read pairs are within `search_length` of the break point of the SV candidate (the original note mentions 800bp)

        Some minor concern for the above conditions are:
        1. Depending on the choice of the "start position" or "end position", the distance between the read and break point differs. This can generate slight bias...
        (but I believe we can recover this by setting sufficient margin, and summarize the alignment result carefully.)
        2. Maybe, for some of the read pair, the result of alignment is obvious. But should we re-align them?

        Unmapped reads, reads with an unmapped mate, secondary, supplementary and duplicate alignments are ignored.
        A pair is written only when both mates were collected; each pair is written as two FASTA records named `>readID/1` and `>readID/2`.

        Returns 1 (and leaves the output file untouched) when the number of reads counted around either break point is `max_depth` or more, 0 otherwise.

    """

    bamfile = pysam.Samfile(bamFilePath, 'rb')
    try:
        # if the #sequence read is over the `max_depth`, then that key is ignored
        for juncChr, juncPos in ((juncChr1, juncPos1), (juncChr2, juncPos2)):
            if bamfile.count(juncChr, int(juncPos) - 1, int(juncPos) + 1) >= max_depth:
                sys.stderr.write("sequence depth exceeds the threshould for: " + ','.join([juncChr1, str(juncPos1), juncDir1, juncChr2, str(juncPos2), juncDir2]) + '\n')
                return 1

        with open(outputFilePath, 'w') as hOUT:

            # first pass: decide which read names to keep
            readID2exist = {}
            _svMarkSupportingReads(bamfile, readID2exist, juncChr1, juncPos1, juncDir1, juncChr2, juncPos2, juncDir2, search_length, search_margin)
            _svMarkSupportingReads(bamfile, readID2exist, juncChr2, juncPos2, juncDir2, juncChr1, juncPos1, juncDir1, search_length, search_margin)

            # second pass: a mate may lie in either window, so both windows
            # are scanned again to gather the sequences
            readID2seq1 = {}
            readID2seq2 = {}
            _svCollectSequences(bamfile, juncChr1, juncPos1, search_length, readID2exist, readID2seq1, readID2seq2)
            _svCollectSequences(bamfile, juncChr2, juncPos2, search_length, readID2exist, readID2seq1, readID2seq2)

            for readID in readID2seq1:
                if readID in readID2seq2:
                    hOUT.write('>' + readID + '/1' + '\n')
                    hOUT.write(str(readID2seq1[readID]) + '\n')
                    hOUT.write('>' + readID + '/2' + '\n')
                    hOUT.write(str(readID2seq2[readID]) + '\n')
    finally:
        # close the BAM handle on every path, including the early return
        bamfile.close()

    return 0
Пример #5
0
def _svFetchReferenceSeq(reference_genome, chrom, start, end):
    """Return the upper-cased reference sequence of region chrom:start-end."""
    # NOTE(review): this iterates the result of pysam.faidx as a sequence of
    # lines; confirm that the installed pysam version returns lines here.
    region = chrom + ":" + str(start) + "-" + str(end)
    return ''.join(item.rstrip('\n').upper() for item in pysam.faidx(reference_genome, region) if item[0] != ">")


def _svWriteFastaRecord(hOUT, name, seq):
    """Write one FASTA record (header line, then the sequence on one line)."""
    hOUT.write(name + '\n')
    hOUT.write(seq + '\n')


def getRefAltForSV(outputFilePath, juncChr1, juncPos1, juncDir1, juncChr2, juncPos2, juncDir2, juncSeq, reference_genome, split_refernece_thres, validate_sequence_length):

    """
        for short SV (mid-range (<= split_refernece_thres bp) deletion, tandem duplication), we get the two sequence (`_ref` and `_alt`)
        for large SV (> split_refernece_thres bp), we get three sequence (one joint sequence by two break points (`_alt`), and two reference sequences around the break points (`_ref1`, `_ref2`))

        the only concern is short inversion... (are there some somatic short inversion?)
        however, this will be filtered beforehand by the "cover filter", and maybe we have to give up detecting these class of SVs.

        Every record is named `>juncChr1,juncPos1,juncDir1,juncChr2,juncPos2,juncDir2` plus its suffix,
        and each flank extends `validate_sequence_length` bases from the break point.
        `juncSeq` is the sequence inserted at the junction; "---" means there is none.

    """

    with open(outputFilePath, 'w') as hOUT:

        if juncSeq == "---": juncSeq = ""

        pos1 = int(juncPos1)
        pos2 = int(juncPos2)
        flank = validate_sequence_length
        header = '>' + ','.join([juncChr1, str(juncPos1), juncDir1, juncChr2, str(juncPos2), juncDir2])

        # for mid-range deletion or tandem duplication
        if juncChr1 == juncChr2 and abs(pos1 - pos2) <= split_refernece_thres and juncDir1 != juncDir2:

            # one reference stretch covering both break points
            seq = _svFetchReferenceSeq(reference_genome, juncChr1, pos1 - flank, pos2 + flank)
            _svWriteFastaRecord(hOUT, header + "_ref", seq)

            if juncDir1 == "+" and juncDir2 == "-":
                # for mid-range deletion: upstream of break point 1 joined to
                # downstream of break point 2
                seq = _svFetchReferenceSeq(reference_genome, juncChr1, pos1 - flank, juncPos1)
                seq = seq + juncSeq
                seq = seq + _svFetchReferenceSeq(reference_genome, juncChr2, juncPos2, pos2 + flank)
            else:
                # for mid-range tandem duplication: upstream of break point 2
                # joined to downstream of break point 1
                seq = _svFetchReferenceSeq(reference_genome, juncChr2, pos2 - flank, juncPos2)
                seq = seq + juncSeq
                seq = seq + _svFetchReferenceSeq(reference_genome, juncChr1, juncPos1, pos1 + flank)

            _svWriteFastaRecord(hOUT, header + "_alt", seq)

        else:

            seq = _svFetchReferenceSeq(reference_genome, juncChr1, pos1 - flank, pos1 + flank)
            _svWriteFastaRecord(hOUT, header + "_ref1", seq)

            seq = _svFetchReferenceSeq(reference_genome, juncChr2, pos2 - flank, pos2 + flank)
            _svWriteFastaRecord(hOUT, header + "_ref2", seq)

            # first half of the joint sequence: the flank of break point 1,
            # reverse-complemented when it lies downstream of the break point
            if juncDir1 == "+":
                tseq = _svFetchReferenceSeq(reference_genome, juncChr1, pos1 - flank, juncPos1)
            else:
                tseq = _svFetchReferenceSeq(reference_genome, juncChr1, juncPos1, pos1 + flank)
                tseq = utils.reverseComplement(tseq)

            seq = tseq + juncSeq

            # second half: the flank of break point 2, reverse-complemented
            # when it lies upstream of the break point
            if juncDir2 == "-":
                tseq = _svFetchReferenceSeq(reference_genome, juncChr2, juncPos2, pos2 + flank)
            else:
                tseq = _svFetchReferenceSeq(reference_genome, juncChr2, pos2 - flank, juncPos2)
                tseq = utils.reverseComplement(tseq)

            seq = seq + tseq

            _svWriteFastaRecord(hOUT, header + "_alt", seq)
Пример #6
0
def _junction_descriptor(F):
    """Build the comma-joined junction descriptor for one input row.

    The descriptor is ``F[0],F[2],F[8],F[3],F[5],F[9],<inserted sequence>``.
    The inserted sequence (``F[7]``) is reverse-complemented when it is not
    the placeholder "---", both directions (``F[8]`` and ``F[9]``) are equal
    and ``F[15]`` is "2".
    """
    inseq = F[7]
    if F[7] != "---" and F[8] == F[9] and F[15] == "2":
        inseq = utils.reverseComplement(F[7])
    return ','.join([F[0], F[2], F[8], F[3], F[5], F[9], inseq])


def _format_cluster(bedpe_info, junctions):
    """Return the output record (without newline) for one pooled cluster.

    bedpe_info -- tab-joined, ';'-accumulated per-read fields of the cluster
    junctions  -- ';'-joined junction descriptors of the cluster

    The most frequent descriptor supplies the reported coordinates; a tie is
    resolved by whatever collections.Counter.most_common returns first.
    """
    tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = bedpe_info.split('\t')

    # obtain the most frequent junction
    best_junc = collections.Counter(junctions.split(';')).most_common(1)[0][0]
    btchr1, btend1, btdir1, btchr2, btend2, btdir2, btinseq = best_junc.split(',')

    # each start is reported as end - 1
    btstart1 = str(int(btend1) - 1)
    btstart2 = str(int(btend2) - 1)

    fields = [btchr1, btstart1, btend1, btchr2, btstart2, btend2,
              tids, btinseq, btdir1, btdir2, tmqs1, talns1,
              tmqs2, talns2, tpinds, tcinds]
    return '\t'.join(fields) + '\t' + junctions


def clusterJunction(inputFilePath, outputFilePath, check_margin_size):

    """
        script for merging and summarizing junction read pairs

        inputFilePath     -- tab-delimited file; columns 0-15 of every row are
                             used (0-5 are named chr1, start1, end1, chr2,
                             start2, end2 in the code, 6 is the read ID, 7 the
                             inserted sequence, 8 and 9 the two directions,
                             10-15 per-read values that are only accumulated).
        outputFilePath    -- file that receives one line per cluster.
        check_margin_size -- a pooled cluster is written out once a row is on
                             another chromosome (column 0) or its column 1
                             exceeds the cluster's end1 + check_margin_size.

        Rows are merged into a pooled cluster when chromosomes and directions
        agree and the second break point is shifted consistently with the
        shift of the first break point and the inserted sequence length.

        Each output line holds the coordinates, inserted sequence and
        directions of the most frequent junction, the ';'-joined per-read
        fields, and finally the ';'-joined list of all merged junctions.

        NOTE(review): the flush rule presumably relies on the input being
        sorted by columns 0 and 1 -- confirm against the caller.
    """

    mergedBedpeInfo = {}
    mergedJunction = {}

    # the context manager closes both files even when a malformed row raises
    with open(inputFilePath, 'r') as hIN, open(outputFilePath, 'w') as hOUT:

        for line in hIN:

            F = line.rstrip('\n').split('\t')

            matched = False
            flushedKeys = []
            for key in sorted(mergedBedpeInfo):

                tchr1, tstart1, tend1, tchr2, tstart2, tend2, tdir1, tdir2, inseqSize = key.split('\t')

                # the investigated key is sufficiently far from the current line and
                # no additional line to merge is expected: flush the key
                if F[0] != tchr1 or int(F[1]) > int(tend1) + check_margin_size:
                    hOUT.write(_format_cluster(mergedBedpeInfo[key], mergedJunction[key]) + '\n')
                    # the key is removed from the dictionaries after this scan
                    flushedKeys.append(key)
                    continue

                # only clusters with the same partner chromosome and directions can merge
                # (F[0] == tchr1 is already guaranteed at this point)
                if not (F[3] == tchr2 and F[8] == tdir1 and F[9] == tdir2):
                    continue

                # detailed check on the junction position considering inserted sequences
                samePosition = False
                if F[8] == "+":
                    expectedDiffSize = (int(F[2]) - int(tend1)) + (len(F[7]) - int(inseqSize))
                    if (F[9] == "+" and int(F[5]) == int(tend2) - expectedDiffSize) or \
                       (F[9] == "-" and int(F[5]) == int(tend2) + expectedDiffSize):
                        samePosition = True
                else:
                    expectedDiffSize = (int(F[2]) - int(tend1)) + (int(inseqSize) - len(F[7]))
                    if (F[9] == "+" and int(F[5]) == int(tend2) + expectedDiffSize) or \
                       (F[9] == "-" and int(F[5]) == int(tend2) - expectedDiffSize):
                        samePosition = True

                if not samePosition:
                    continue

                # the junction position and direction match: append this row to the cluster
                matched = True
                tids, tinseqs, tmqs1, talns1, tmqs2, talns2, tpinds, tcinds = mergedBedpeInfo[key].split('\t')
                mergedBedpeInfo[key] = '\t'.join([tids + ';' + F[6],
                                                  tinseqs + ';' + F[7],
                                                  tmqs1 + ';' + F[10],
                                                  talns1 + ';' + F[11],
                                                  tmqs2 + ';' + F[12],
                                                  talns2 + ';' + F[13],
                                                  tpinds + ';' + F[14],
                                                  tcinds + ';' + F[15]])
                mergedJunction[key] = mergedJunction[key] + ";" + _junction_descriptor(F)

            for key in flushedKeys:
                del mergedBedpeInfo[key]
                del mergedJunction[key]

            # the current line does not match any of the pooled keys: start a new cluster
            if not matched:
                newKey = '\t'.join([F[0], F[1], F[2], F[3], F[4], F[5], F[8], F[9], str(len(F[7]))])
                mergedBedpeInfo[newKey] = F[6] + '\t' + F[7] + '\t' + '\t'.join(F[10:16])
                mergedJunction[newKey] = _junction_descriptor(F)

        # last treatment: write out every cluster that is still pooled
        for key in sorted(mergedBedpeInfo):
            hOUT.write(_format_cluster(mergedBedpeInfo[key], mergedJunction[key]) + '\n')
Пример #7
0
    'TTA': 'L',
    'TTG': 'L',
    'TAC': 'Y',
    'TAT': 'Y',
    'TAA': '*',
    'TAG': '*',
    'TGC': 'C',
    'TGT': 'C',
    'TGA': '*',
    'TGG': 'W',
}

for header, sequence in utils.parse_fasta(sequenceFile):
    positiveStrand = ""
    longestCDS = 0
    strands = [sequence, utils.reverseComplement(sequence)]
    for strand in strands:
        for frame in range(3):
            proteinSequence = ""
            for fragment in range(frame, len(strand), 3):
                codon = strand[fragment:fragment + 3]
                if len(codon) != 3:
                    continue
                try:
                    proteinSequence += codon2aminoacid[codon]
                except KeyError:
                    proteinSequence += 'X'
            matches = regex_orf.findall(proteinSequence)
            allORFs = "".join([x for x in matches if x])
            if len(allORFs) / float(len(strand)) > longestCDS:
                longestCDS = len(allORFs) / float(len(strand))