示例#1
0
def polyA_len(sequence):
    '''
    Compute the length of the poly(A) located at the sequence end

    Input:
    1. sequence: sequence to be searched for poly(A) monomers

    Output:
    1. polyAlen: end - beg of the last 'A' monomer reported by
       sequences.find_monomers. 0 if no monomer is found or if the last
       monomer ends more than 30 bp before the sequence end
    '''
    ## Monomer search settings, passed positionally to find_monomers:
    # windowSize=8, maxWindowDist=2, minMonomerSize=10, minPurity=80
    hits = sequences.find_monomers(sequence, 'A', 8, 2, 10, 80)

    ## Nothing found -> no poly(A)
    if not hits:
        return 0

    ## Only the last reported monomer is considered
    tail = hits[-1]
    gap2end = len(sequence) - tail.end

    ## Discard the monomer when it lies too far from the sequence end
    return tail.end - tail.beg if gap2end <= 30 else 0
示例#2
0
def trim_polyA(sequence):
    '''
    Trim the poly(A) located at the sequence end

    Input:
    1. sequence: sequence to be trimmed

    Output:
    1. sequence: input sequence cut at the begin of its last 'A' monomer.
       The input is returned untouched if no monomer is found or if the
       last monomer ends more than 30 bp before the sequence end
    '''
    ## Monomer search settings
    searchConfig = {
        'windowSize': 8,
        'maxWindowDist': 2,
        'minMonomerSize': 10,
        'minPurity': 80
    }

    hits = sequences.find_monomers(sequence, 'A', searchConfig['windowSize'],
                                   searchConfig['maxWindowDist'],
                                   searchConfig['minMonomerSize'],
                                   searchConfig['minPurity'])

    ## No monomer at all -> nothing to trim
    if not hits:
        return sequence

    ## Only the last reported monomer is considered
    tail = hits[-1]

    ## Trim only if the monomer is within 30 bp of the sequence end
    if len(sequence) - tail.end <= 30:
        return sequence[:tail.beg]

    return sequence
示例#3
0
def search4polyT(sequence):
    '''
    Search for a poly(T) at the sequence begin

    Input:
    1. sequence: sequence to be searched for poly(T) monomers

    Output:
    1. polyT: True if the first 'T' monomer starts within the first 30 bp
       of the sequence, False otherwise
    2. candidate: first 'T' monomer reported by sequences.find_monomers
       (returned even when polyT is False). None if no monomer was found
    '''
    ## Monomer search settings
    target = 'T'
    window, windowGap, minSize, purity = 8, 2, 10, 80

    hits = sequences.find_monomers(sequence, target, window, windowGap,
                                   minSize, purity)

    ## No monomer found
    if not hits:
        return False, None

    ## Only the first reported monomer is considered
    first = hits[0]

    ## The monomer begin coordinate is its distance to the sequence begin
    nearBegin = bool(first.beg <= 30)

    return nearBegin, first
示例#4
0
def search4polyA(sequence):
    '''
    Search for a poly(A) at the sequence end

    Input:
    1. sequence: sequence to be searched for poly(A) monomers

    Output:
    1. polyA: True if the last 'A' monomer ends within 30 bp of the
       sequence end, False otherwise
    2. candidate: last 'A' monomer reported by sequences.find_monomers
       (returned even when polyA is False). None if no monomer was found
    '''
    ## Monomer search settings
    target = 'A'
    window, windowGap, minSize, purity = 8, 2, 10, 80

    hits = sequences.find_monomers(sequence, target, window, windowGap,
                                   minSize, purity)

    ## No monomer found
    if not hits:
        return False, None

    ## Only the last reported monomer is considered
    last = hits[-1]

    ## Number of bases left between the monomer end and the sequence end
    remaining = len(sequence) - last.end
    nearEnd = bool(remaining <= 30)

    return nearEnd, last
示例#5
0
def has_polyA_illumina(targetSeq):
    '''
    Search for polyA/polyT tails in consensus sequence of Illumina clipping events
    
    Input:
    1. targetSeq: consensus sequence of Illumina clipping events
    
    Output:
    1. has_polyA: True if at least one 'T' or 'A' monomer is left after
       sequences.filter_internal_monomers, False otherwise
    '''

    ## 0. Set up monomer searching parameters ##
    windowSize = 8
    maxWindowDist = 2
    minMonomerSize = 8
    minPurity = 95
    maxDist2Ends = 1

    monomerTails = []

    ## 1. Search for polyA/polyT at the sequence ends ##
    for targetMonomer in ('T', 'A'):

        monomers = sequences.find_monomers(targetSeq, targetMonomer,
                                           windowSize, maxWindowDist,
                                           minMonomerSize, minPurity)
        filtMonomers = sequences.filter_internal_monomers(
            monomers, targetSeq, maxDist2Ends, minMonomerSize)

        # A None result contributes nothing. The previous fallback extended
        # the accumulator with itself, duplicating what was already collected
        if filtMonomers is not None:
            monomerTails += filtMonomers

    ## 2. Empty-list placeholders do not count as tails ##
    has_polyA = any(tail != [] for tail in monomerTails)

    return has_polyA
示例#6
0
def trim_3prime_td(sequence):
    '''
    Trim 3 prime transduction at sequence end

    Input:
    1. sequence: sequence to be trimmed

    Output:
    1. trimmed: input sequence cut at the begin of the second-to-last 'A'
       monomer (or of the only monomer when a single one is found). The
       input is returned untouched if no monomer is found
    '''
    ## Monomer search settings
    target = 'A'
    window, windowGap, minSize, purity = 8, 2, 10, 80

    hits = sequences.find_monomers(sequence, target, window, windowGap,
                                   minSize, purity)

    if hits:
        ## Take the second-to-last monomer, falling back to the single one
        anchor = hits[-2] if len(hits) > 1 else hits[0]
        return sequence[:anchor.beg]

    ## No monomer -> nothing to trim
    return sequence
示例#7
0
def infer_strand_polyA(sequence, chain):
    '''
    Infer insertion strand based on two criteria:
        1) Location of polyA/T tail at sequence ends
        2) Alignment strand for the insert 3' end over the template sequence

    Input: 
        1. sequence: consensus inserted sequence
        2. chain: Sequence chain of alignments over retrotranposon consensus sequences and/or transduced regions

    Output:
        1. strand: Insertion strand (+, - or None) 
        2. polyA: boolean specifying if polyA/T sequence was found

    Side effects:
        - On + strand the selected monomers get their beg/end shifted by the
          qEnd of the last alignment, so they are expressed in inserted
          sequence coordinates
        - One 'poly(A/T)' PAF alignment per selected monomer is appended to
          (+ strand) or inserted at the begin of (- strand) chain.alignments
    '''
    ### Configuration, passed positionally to the sequences helpers
    # windowSize, maxWindowDist, minMonomerSize, minPurity
    searchParams = (8, 2, 10, 80)
    # maxDist2Ends, minInternalMonomerSize
    filterParams = (10, 20)

    def find_tails(unaligned, nucleotide):
        # Search monomers of the given nucleotide, then filter internal ones
        found = sequences.find_monomers(unaligned, nucleotide, *searchParams)
        return sequences.filter_internal_monomers(found, unaligned,
                                                  *filterParams)

    ## 1. Poly(A) search on the unaligned 3' end of the insert ##
    lastHit = chain.alignments[-1]
    tails3 = find_tails(sequence[lastHit.qEnd:], 'A')

    ## 2. Poly(T) search on the unaligned 5' end of the insert ##
    firstHit = chain.alignments[0]
    tails5 = find_tails(sequence[:firstHit.qBeg], 'T')

    ## 3. Compare the accumulated monomer length at both ends ##
    total3 = sum(tail.length() for tail in tails3)
    total5 = sum(tail.length() for tail in tails5)

    # a) Unknown strand: equal totals. This covers both "no monomer at any
    #    end" (0 == 0) and a tie between the 3' and 5' ends
    if total3 == total5:
        return None, False

    def as_alignment(tail):
        # PAF line describing one poly(A/T) monomer on the inserted sequence
        fields = [
            firstHit.qName, firstHit.qLen, tail.beg, tail.end, None,
            'poly(A/T)', 0, 0, 0, 0, 0, 0
        ]
        return formats.PAF_alignment(fields)

    # b) Positive strand
    # -----insert------**unaligned**
    if total3 > total5:

        # Convert coordinates from the unaligned 3' piece to the insert
        for tail in tails3:
            tail.beg += lastHit.qEnd
            tail.end += lastHit.qEnd

        # Add the poly(A) hits to the end of the chain
        for tail in tails3:
            chain.alignments.append(as_alignment(tail))

        return '+', True

    # c) Negative strand
    # **unaligned**-----insert------
    # No coordinate conversion needed, as the unaligned piece is the
    # leftmost end of the insert. Inserting in reverse order at position 0
    # keeps the monomers in their original order at the begin of the chain
    for tail in reversed(tails5):
        chain.alignments.insert(0, as_alignment(tail))

    return '-', True
示例#8
0
def MEI_structure(PAF, insertSeq):
    '''
    Characterise the structure of an inserted sequence from its alignments

    Input:
        1. PAF: alignments object exposing chain(). Target names of the chained
           hits are expected to be '|' separated with the family in the second
           field and the subfamily in the third one
        2. insertSeq: inserted sequence

    Output:
        1. structure: dictionary containing:
            - LEN: inserted sequence length
            - CHAIN: alignments chain, extended with the polyA/T and
              partnered annotations found
            - FAM, SUBFAM: family (L1 > SVA > Alu priority) and comma separated
              sorted subfamilies. Only set when one of these families is hit
            - ITYPE: 'solo' or 'partnered'
            - 3PRIME: True, only set for partnered calls
            - POLYA, POLYT, NBPOLY: number of poly(A), poly(T) and selected tails
            - STRAND: '+', '-' or None
            - every key returned by retrotransposons.infer_lengths
            - ORF1, ORF2, COMPETENT: only set for L1 insertions
            - PERC-RESOLVED: value of chain.perc_query_covered()
            - FAILED: list of failed filters ('PERC-RESOLVED', 'LEN')
            - PASS: True if no filter failed
    '''
    structure = {}
    structure['LEN'] = len(insertSeq)

    ## 1. Chain alignments
    structure['CHAIN'] = PAF.chain(20, 50)
    chain = structure['CHAIN']

    ## 2. Determine insertion family
    families = {hit.tName.split('|')[1] for hit in chain.alignments}

    # Sorted so that SUBFAM does not depend on set iteration order
    subfamilies = sorted({hit.tName.split('|')[2] for hit in chain.alignments})

    # First family found following the L1 > SVA > Alu priority. An insert
    # hitting both L1 and Alu is therefore reported as L1; the former
    # dedicated branch for that case could never be reached and was removed.
    # FAM/SUBFAM are left unset when none of the three families is hit
    for candidateFam in ('L1', 'SVA', 'Alu'):

        if candidateFam in families:
            structure['FAM'] = candidateFam
            structure['SUBFAM'] = ','.join(subfamilies)
            break

    ## 3. Search for polyA/T tails at unresolved insert ends and determine insertion type
    structure['ITYPE'] = 'solo'
    rtBeg, rtEnd = chain.interval()

    ## Set parameters for monomer search
    windowSize = 8
    maxWindowDist = 2
    minMonomerSize = 10
    minPurity = 80

    ### 3.1 PolyA search
    ## Search for monomers
    targetSeq = insertSeq[rtEnd:]
    monomersA = sequences.find_monomers(targetSeq, 'A', windowSize,
                                        maxWindowDist, minMonomerSize,
                                        minPurity)

    ## Map to insert sequence coordinates
    for monomer in monomersA:
        monomer.beg = monomer.beg + rtEnd
        monomer.end = monomer.end + rtEnd

    ## Make polyA calls
    structure['POLYA'] = 0
    structure['STRAND'] = None

    # a) Single polyA
    if len(monomersA) == 1:
        dist2rt = monomersA[0].beg - rtEnd
        dist2end = structure['LEN'] - monomersA[0].end

        # Apply filter
        if (dist2rt <= 30) and (dist2end <= 30):
            structure['ITYPE'] = 'solo'
            structure['POLYA'] = 1
            structure['STRAND'] = '+'

    # b) Multiple polyA (Transduction candidate)
    # AAAAAAAAAAAAAA-------------AAAAAAAAAAAAAA
    elif len(monomersA) > 1:
        dist2rt = monomersA[0].beg - rtEnd
        dist2end = structure['LEN'] - monomersA[-1].end

        # Apply filter
        if (dist2rt <= 30) and (dist2end <= 30):
            structure['ITYPE'] = 'partnered'
            structure['3PRIME'] = True
            structure['POLYA'] = len(monomersA)
            structure['STRAND'] = '+'

    ## 3.2 PolyT search
    ## Search for monomers
    # No coordinate mapping needed: the target starts at insert position 0
    targetSeq = insertSeq[:rtBeg]
    monomersT = sequences.find_monomers(targetSeq, 'T', windowSize,
                                        maxWindowDist, minMonomerSize,
                                        minPurity)

    ## Make polyT calls
    structure['POLYT'] = 0

    # a) Single polyT
    if len(monomersT) == 1:
        dist2end = monomersT[0].beg
        dist2rt = rtBeg - monomersT[0].end

        # Apply filter
        if (dist2rt <= 30) and (dist2end <= 30):
            structure['ITYPE'] = 'solo'
            structure['POLYT'] = 1
            structure['STRAND'] = '-'

    # b) Multiple polyT (Partnered transduction candidate)
    # TTTTTTTTTTTTT-------------TTTTTTTTTTTTT
    # Any number of tails above one is accepted, mirroring the polyA branch
    # (this used to require exactly two, silently dropping 3+ tails)
    elif len(monomersT) > 1:
        dist2end = monomersT[0].beg
        dist2rt = rtBeg - monomersT[-1].end

        # Apply filter
        if (dist2rt <= 30) and (dist2end <= 30):
            structure['ITYPE'] = 'partnered'
            structure['3PRIME'] = True
            structure['POLYT'] = len(monomersT)
            structure['STRAND'] = '-'

    ## 3.3 Determine if polyA or polyT found
    # NOTE(review): when both tails are called the result is treated as
    # ambiguous (NBPOLY = 0), but ITYPE/STRAND keep the polyT values - confirm
    # a) PolyA found
    if (structure['POLYA'] != 0) and (structure['POLYT'] == 0):
        tail = 'polyA'
        structure['NBPOLY'] = structure['POLYA']
        monomers = monomersA

    # b) PolyT found
    elif (structure['POLYT'] != 0) and (structure['POLYA'] == 0):
        tail = 'polyT'
        structure['NBPOLY'] = structure['POLYT']
        monomers = monomersT

    # c) No tail or ambiguous
    else:
        tail = None
        structure['NBPOLY'] = 0

    ## 3.4 Build polyA/T and partnered annotations
    def annotation(beg, end, label):
        # PAF line annotating the [beg, end] interval of the inserted sequence
        fields = [
            chain.alignments[0].qName, structure['LEN'], beg, end, None,
            label, 0, 0, 0, 0, 0, 0
        ]
        return formats.PAF_alignment(fields)

    hits2add = []

    if structure['NBPOLY'] >= 1:

        ## First polyA/T (the only one for solo insertions)
        hits2add.append(
            annotation(monomers[0].beg, monomers[0].end, 'PolyA/T'))

        ## Add partnered region/s plus polyAT/s. No pair is produced when a
        ## single tail was called
        pairs = zip(monomers, monomers[1:])

        for counter, (monomer1, monomer2) in enumerate(pairs, 1):

            # Partnered region between two consecutive tails
            hits2add.append(
                annotation(monomer1.end, monomer2.beg,
                           'Partnered_' + str(counter)))

            # Next polyA/T
            hits2add.append(
                annotation(monomer2.beg, monomer2.end, 'PolyA/T'))

    ## 3.5 Add polyA/T plus transduced annotation to the chain
    if tail == 'polyA':
        chain.alignments = chain.alignments + hits2add

    elif tail == 'polyT':
        chain.alignments = hits2add + chain.alignments

    ## 4. Infer inserted sequence length
    # NOTE(review): 'solo' is passed regardless of structure['ITYPE'] - confirm
    lengths = retrotransposons.infer_lengths('solo', chain,
                                             structure['STRAND'])
    structure.update(lengths)

    # None when no family could be assigned; avoids a KeyError below
    family = structure.get('FAM')

    ## 5. Assess ORFs status for L1 insertions
    if family == 'L1':
        orf1, orf2 = retrotransposons.find_orf(insertSeq)[0:2]
        structure['ORF1'] = orf1 is not None
        structure['ORF2'] = orf2 is not None
        structure['COMPETENT'] = bool(structure['ORF1'] and structure['ORF2']
                                      and structure['IS_FULL'])

    ## 6. Apply filters
    failed = []

    # 6.1 Percentage resolved filter
    # Compute % of insertion resolved
    structure['PERC-RESOLVED'] = chain.perc_query_covered()

    if structure['PERC-RESOLVED'] < 60:
        failed.append('PERC-RESOLVED')

    # 6.2 Length filtering for solo insertions
    maxSoloLen = {'L1': 6500, 'Alu': 400, 'SVA': 5000}

    if (structure['ITYPE'] == 'solo') and (family in maxSoloLen):

        if structure['LEN'] > maxSoloLen[family]:
            failed.append('LEN')

    structure['FAILED'] = failed

    # Insertion passes only if no filter failed
    structure['PASS'] = not failed

    return structure