def polyA_len(sequence):
    '''
    Measure the poly(A) tail located at the end of a sequence

    Input:
        1. sequence: sequence to be searched for a poly(A) tail

    Output:
        1. polyAlen: length (end - beg) of the poly(A) monomer closest to the
                     sequence end. 0 if no monomer is found or if the selected
                     monomer ends more than 30 bp away from the sequence end
    '''
    ## Monomer search settings, in the positional order expected by find_monomers:
    # (windowSize, maxWindowDist, minMonomerSize, minPurity)
    searchParams = (8, 2, 10, 80)

    ## Maximum distance allowed between the monomer and the sequence end
    maxDist2end = 30

    ## Search for poly(A) monomers
    tails = sequences.find_monomers(sequence, 'A', *searchParams)

    if not tails:
        return 0

    ## Keep the last reported monomer (the one closest to the sequence end)
    lastTail = tails[-1]

    ## Only accept the monomer if it lies close enough to the sequence end
    gap = len(sequence) - lastTail.end

    if gap <= maxDist2end:
        return lastTail.end - lastTail.beg

    return 0
def trim_polyA(sequence):
    '''
    Remove the poly(A) tail found at the end of a sequence

    Input:
        1. sequence: sequence to be trimmed

    Output:
        1. sequence: input sequence cut at the beginning of the poly(A) monomer
                     closest to the sequence end. The input is returned untouched
                     if no monomer is found or if the selected monomer ends more
                     than 30 bp away from the sequence end
    '''
    ## Monomer search settings, in the positional order expected by find_monomers:
    # (windowSize, maxWindowDist, minMonomerSize, minPurity)
    searchParams = (8, 2, 10, 80)

    ## Maximum distance allowed between the monomer and the sequence end
    maxDist2end = 30

    ## Search for poly(A) monomers
    tails = sequences.find_monomers(sequence, 'A', *searchParams)

    if not tails:
        return sequence

    ## Keep the last reported monomer (the one closest to the sequence end)
    lastTail = tails[-1]
    gap = len(sequence) - lastTail.end

    ## Trim only when the monomer lies close enough to the sequence end
    return sequence[:lastTail.beg] if gap <= maxDist2end else sequence
def search4polyT(sequence):
    '''
    Look for a poly(T) stretch at the beginning of a sequence

    Input:
        1. sequence: sequence to be searched

    Output:
        1. polyT: True if the first reported poly(T) monomer starts at most
                  30 bp away from the sequence begin, False otherwise
        2. candidate: first reported poly(T) monomer (returned even when it
                      fails the distance filter). None if no monomer is found
    '''
    ## Monomer search settings, in the positional order expected by find_monomers:
    # (windowSize, maxWindowDist, minMonomerSize, minPurity)
    searchParams = (8, 2, 10, 80)

    ## Maximum distance allowed between the sequence begin and the monomer
    maxDist2beg = 30

    ## Search for poly(T) monomers
    hits = sequences.find_monomers(sequence, 'T', *searchParams)

    if not hits:
        return False, None

    ## Keep the first reported monomer (the one closest to the sequence begin)
    firstHit = hits[0]

    ## The monomer begin coordinate is its distance to the sequence begin
    isTail = bool(firstHit.beg <= maxDist2beg)

    return isTail, firstHit
def search4polyA(sequence):
    '''
    Look for a poly(A) stretch at the end of a sequence

    Input:
        1. sequence: sequence to be searched

    Output:
        1. polyA: True if the last reported poly(A) monomer ends at most
                  30 bp away from the sequence end, False otherwise
        2. candidate: last reported poly(A) monomer (returned even when it
                      fails the distance filter). None if no monomer is found
    '''
    ## Monomer search settings, in the positional order expected by find_monomers:
    # (windowSize, maxWindowDist, minMonomerSize, minPurity)
    searchParams = (8, 2, 10, 80)

    ## Maximum distance allowed between the monomer and the sequence end
    maxDist2end = 30

    ## Search for poly(A) monomers
    hits = sequences.find_monomers(sequence, 'A', *searchParams)

    if not hits:
        return False, None

    ## Keep the last reported monomer (the one closest to the sequence end)
    lastHit = hits[-1]

    ## Distance between the monomer end and the sequence end
    gap = len(sequence) - lastHit.end
    isTail = bool(gap <= maxDist2end)

    return isTail, lastHit
def has_polyA_illumina(targetSeq):
    '''
    Search for polyA/polyT tails in consensus sequence of Illumina clipping events

    Input:
        1. targetSeq: consensus sequence of Illumina clipping events

    Output:
        1. has_polyA: True if at least one poly(A) or poly(T) monomer is retained
                      by sequences.filter_internal_monomers, False otherwise
    '''
    ## 0. Set up monomer searching parameters ##
    windowSize = 8
    maxWindowDist = 2
    minMonomerSize = 8
    minPurity = 95
    maxDist2Ends = 1

    monomerTails = []

    ## 1. Search for polyA/polyT at the sequence ends ##
    for targetMonomer in ('T', 'A'):

        monomers = sequences.find_monomers(targetSeq, targetMonomer,
                                           windowSize, maxWindowDist,
                                           minMonomerSize, minPurity)

        filtMonomers = sequences.filter_internal_monomers(
            monomers, targetSeq, maxDist2Ends, minMonomerSize)

        # A None result is skipped. The previous implementation appended
        # monomerTails to itself in that case, duplicating every monomer
        # collected so far.
        if filtMonomers is not None:
            monomerTails.extend(filtMonomers)

    ## 2. Drop empty-list placeholders in a single pass ##
    # Kept for parity with the previous behaviour, which removed any [] entry
    monomerTails = [monomer for monomer in monomerTails if monomer != []]

    ## 3. A tail is called as soon as one monomer remains ##
    has_polyA = bool(monomerTails)

    return has_polyA
def trim_3prime_td(sequence):
    '''
    Cut a 3 prime transduction off the end of a sequence

    Input:
        1. sequence: sequence to be trimmed

    Output:
        1. trimmed: input sequence cut at the beginning of the second to last
                    poly(A) monomer reported (or of the only monomer when a single
                    one is reported). The input is returned untouched if no
                    monomer is found
    '''
    ## Monomer search settings, in the positional order expected by find_monomers:
    # (windowSize, maxWindowDist, minMonomerSize, minPurity)
    searchParams = (8, 2, 10, 80)

    ## Search for poly(A) monomers
    tails = sequences.find_monomers(sequence, 'A', *searchParams)

    if not tails:
        return sequence

    ## Pick the second closest monomer to the sequence end, falling back to
    ## the single available monomer when there is no second one
    if len(tails) >= 2:
        cutPoint = tails[-2]
    else:
        cutPoint = tails[0]

    return sequence[:cutPoint.beg]
def infer_strand_polyA(sequence, chain):
    '''
    Infer insertion strand based on two criteria:
        1) Location of polyA/T tail at sequence ends
        2) Alignment strand for the insert 3' end over the template sequence

    Input:
        1. sequence: consensus inserted sequence
        2. chain: Sequence chain of alignments over retrotranposon consensus sequences and/or transduced regions.
                  Modified in place: one 'poly(A/T)' alignment per selected monomer is appended
                  (+ strand) or inserted at the begin (- strand)

    Output:
        1. strand: Insertion strand (+, - or None)
        2. polyA: boolean specifying if polyA/T sequence was found
    '''
    ### Configuration, in the positional order expected by the helpers
    # find_monomers: (windowSize, maxWindowDist, minMonomerSize, minPurity)
    searchParams = (8, 2, 10, 80)

    # filter_internal_monomers: (maxDist2Ends, minInternalMonomerSize)
    filterParams = (10, 20)

    ## 1. poly(A) search on the unaligned 3' end of the insert ##
    lastHit = chain.alignments[-1]
    tail3prime = sequence[lastHit.qEnd:]

    polyAs = sequences.find_monomers(tail3prime, 'A', *searchParams)
    polyAs = sequences.filter_internal_monomers(polyAs, tail3prime, *filterParams)

    ## 2. poly(T) search on the unaligned 5' end of the insert ##
    firstHit = chain.alignments[0]
    tail5prime = sequence[:firstHit.qBeg]

    polyTs = sequences.find_monomers(tail5prime, 'T', *searchParams)
    polyTs = sequences.filter_internal_monomers(polyTs, tail5prime, *filterParams)

    ## 3. Compare the accumulated monomer length at each end ##
    total3prime = sum(monomer.length() for monomer in polyAs)
    total5prime = sum(monomer.length() for monomer in polyTs)

    # a) Unknown strand: equal totals cover both "no monomer at any end"
    #    (0 == 0) and the ambiguous case of same-sized tails at both ends
    if total3prime == total5prime:
        return None, False

    def tail_alignment(monomer):
        # PAF-like record describing a poly(A/T) stretch on the query
        record = [
            firstHit.qName, firstHit.qLen, monomer.beg, monomer.end, None,
            'poly(A/T)', 0, 0, 0, 0, 0, 0
        ]
        return formats.PAF_alignment(record)

    # b) Positive strand: tail at the 3' end
    #    -----insert------**unaligned**
    if total3prime > total5prime:

        # Monomer coordinates are relative to the unaligned 3' piece, so
        # shift them into inserted sequence space
        shift = lastHit.qEnd

        for monomer in polyAs:
            monomer.beg += shift
            monomer.end += shift

        # poly(A) records go at the end of the chain
        for monomer in polyAs:
            chain.alignments.append(tail_alignment(monomer))

        return '+', True

    # c) Negative strand: tail at the 5' end
    #    **unaligned**-----insert------
    #    No shift needed as the unaligned piece is the leftmost end of the insert.
    #    Walking backwards while inserting at position 0 preserves monomer order
    for monomer in reversed(polyTs):
        chain.alignments.insert(0, tail_alignment(monomer))

    return '-', True
def MEI_structure(PAF, insertSeq):
    '''
    Characterise the structure of an inserted sequence from its alignments over
    retrotransposon templates (family, polyA/T tails, insertion type, strand,
    lengths, ORF status for L1) and flag it as passing or failing a set of filters

    Input:
        1. PAF: object providing a chain() method returning a chain of alignments.
                Target names (tName) are expected to be '|' separated with the
                family at position 1 and the subfamily at position 2
        2. insertSeq: inserted sequence

    Output:
        1. structure: dictionary containing:
            LEN: inserted sequence length
            CHAIN: chain of alignments, extended with polyA/T and partnered annotations
            FAM: 'L1', 'SVA', 'Alu' or None if none of these families is hit
            SUBFAM: comma separated, sorted list of subfamilies (None if FAM is None)
            ITYPE: 'solo' or 'partnered'
            3PRIME: True, only set for partnered candidates
            POLYA / POLYT: number of polyA / polyT monomers called
            NBPOLY: number of tails retained (0 if none or ambiguous)
            STRAND: '+', '-' or None
            keys returned by retrotransposons.infer_lengths
            ORF1, ORF2, COMPETENT: only for L1 insertions
            PERC-RESOLVED: value returned by the chain perc_query_covered()
            FAILED: list of failed filter names
            PASS: True if no filter failed
    '''
    structure = {}
    structure['LEN'] = len(insertSeq)

    ## 1. Chain alignments
    # NOTE(review): meaning of the (20, 50) arguments is defined by PAF.chain - confirm
    structure['CHAIN'] = PAF.chain(20, 50)
    chain = structure['CHAIN']

    ## 2. Determine insertion family
    families = {hit.tName.split('|')[1] for hit in chain.alignments}
    subfamilies = {hit.tName.split('|')[2] for hit in chain.alignments}

    # Default to None so inserts without L1/SVA/Alu hits no longer raise a
    # KeyError further down
    structure['FAM'] = None
    structure['SUBFAM'] = None

    # Families are tested by precedence: L1 > SVA > Alu. An insert with both
    # L1 and Alu hits is therefore reported as L1
    for family in ('L1', 'SVA', 'Alu'):
        if family in families:
            structure['FAM'] = family

            # Sorted so the output does not depend on set iteration order
            structure['SUBFAM'] = ','.join(sorted(subfamilies))
            break

    ## 3. Search for polyA/T tails at unresolved insert ends and determine insertion type
    structure['ITYPE'] = 'solo'
    rtBeg, rtEnd = chain.interval()

    ## Set parameters for monomer search
    windowSize = 8
    maxWindowDist = 2
    minMonomerSize = 10
    minPurity = 80

    ## Maximum distance between a tail and the retrotransposon / insert end
    maxDist = 30

    ### 3.1 PolyA search
    ## Search for monomers
    targetSeq = insertSeq[rtEnd:]
    monomersA = sequences.find_monomers(targetSeq, 'A', windowSize,
                                        maxWindowDist, minMonomerSize,
                                        minPurity)

    ## Map to insert sequence coordinates
    for monomer in monomersA:
        monomer.beg = monomer.beg + rtEnd
        monomer.end = monomer.end + rtEnd

    ## Make polyA calls
    structure['POLYA'] = 0
    structure['STRAND'] = None

    if monomersA:
        dist2rt = monomersA[0].beg - rtEnd
        dist2end = structure['LEN'] - monomersA[-1].end

        # Apply filter
        if (dist2rt <= maxDist) and (dist2end <= maxDist):

            # a) Single polyA
            if len(monomersA) == 1:
                structure['ITYPE'] = 'solo'

            # b) Multiple polyA (Transduction candidate)
            # AAAAAAAAAAAAAA-------------AAAAAAAAAAAAAA
            else:
                structure['ITYPE'] = 'partnered'
                structure['3PRIME'] = True

            structure['POLYA'] = len(monomersA)
            structure['STRAND'] = '+'

    ## 3.2 PolyT search
    ## Search for monomers
    # No coordinate conversion needed as the target is the leftmost piece of the insert
    targetSeq = insertSeq[:rtBeg]
    monomersT = sequences.find_monomers(targetSeq, 'T', windowSize,
                                        maxWindowDist, minMonomerSize,
                                        minPurity)

    ## Make polyT calls
    structure['POLYT'] = 0

    if monomersT:
        dist2end = monomersT[0].beg
        dist2rt = rtBeg - monomersT[-1].end

        # Apply filter
        if (dist2rt <= maxDist) and (dist2end <= maxDist):

            # a) Single polyT
            if len(monomersT) == 1:
                structure['ITYPE'] = 'solo'

            # b) Multiple polyT (Partnered transduction candidate)
            # TTTTTTTTTTTTT-------------TTTTTTTTTTTTT
            # Any number of monomers above one is accepted, as done for polyA
            # (it used to require exactly two)
            else:
                structure['ITYPE'] = 'partnered'
                structure['3PRIME'] = True

            structure['POLYT'] = len(monomersT)
            structure['STRAND'] = '-'

    ## 3.3 Determine if polyA or polyT found
    # a) PolyA found
    if (structure['POLYA'] != 0) and (structure['POLYT'] == 0):
        tail = 'polyA'
        structure['NBPOLY'] = structure['POLYA']
        monomers = monomersA

    # b) PolyT found
    elif (structure['POLYT'] != 0) and (structure['POLYA'] == 0):
        tail = 'polyT'
        structure['NBPOLY'] = structure['POLYT']
        monomers = monomersT

    # c) No tail or ambiguous
    # NOTE(review): when both tails are called, ITYPE/STRAND/3PRIME keep the
    # values set by the polyT search - confirm this is intended
    else:
        tail = None
        structure['NBPOLY'] = 0
        monomers = []

    ## 3.4 Build polyA/T and partnered annotations
    def make_hit(beg, end, label):
        # PAF-like record annotating the [beg, end) piece of the insert
        fields = [
            chain.alignments[0].qName, structure['LEN'], beg, end, None,
            label, 0, 0, 0, 0, 0, 0
        ]
        return formats.PAF_alignment(fields)

    hits2add = []

    if structure['NBPOLY'] >= 1:

        ## First polyA/T (the only one for solo insertions)
        hits2add.append(make_hit(monomers[0].beg, monomers[0].end, 'PolyA/T'))

        ## Add partnered region/s plus polyAT/s (none for solo insertions)
        pairs = zip(monomers, monomers[1:])

        for counter, (monomer1, monomer2) in enumerate(pairs, start=1):

            # Partnered region: piece between two consecutive tails
            hits2add.append(make_hit(monomer1.end, monomer2.beg,
                                     'Partnered_' + str(counter)))

            # Next polyA/T
            hits2add.append(make_hit(monomer2.beg, monomer2.end, 'PolyA/T'))

    ## 3.5 Add polyA/T plus transduced annotation to the chain
    if tail == 'polyA':
        chain.alignments = chain.alignments + hits2add

    elif tail == 'polyT':
        chain.alignments = hits2add + chain.alignments

    ## 4. Infer inserted sequence length
    # NOTE(review): 'solo' is passed regardless of structure['ITYPE'] - confirm
    lengths = retrotransposons.infer_lengths('solo', chain, structure['STRAND'])
    structure.update(lengths)

    ## 5. Assess ORFs status for L1 insertions
    if structure['FAM'] == 'L1':
        orf1, orf2 = retrotransposons.find_orf(insertSeq)[0:2]
        structure['ORF1'] = orf1 is not None
        structure['ORF2'] = orf2 is not None

        # IS_FULL is not set in this function; presumably provided by
        # infer_lengths - verify
        structure['COMPETENT'] = bool(structure['ORF1'] and structure['ORF2']
                                      and structure['IS_FULL'])

    ## 6. Apply filters
    failed = []

    # 6.1 Percentage resolved filter
    # Compute % of insertion resolved
    structure['PERC-RESOLVED'] = chain.perc_query_covered()

    if structure['PERC-RESOLVED'] < 60:
        failed.append('PERC-RESOLVED')

    # 6.2 Length filtering for solo insertions
    maxSoloLen = {'L1': 6500, 'Alu': 400, 'SVA': 5000}

    if structure['ITYPE'] == 'solo':
        limit = maxSoloLen.get(structure['FAM'])

        if (limit is not None) and (structure['LEN'] > limit):
            failed.append('LEN')

    structure['FAILED'] = failed

    # Insertion passes only if no filter failed
    structure['PASS'] = not failed

    return structure