def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,pacbporfA,
    verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    Candidate ( intronQ, intronS ) pairs are obtained from
    merge_pacbporfs_with_introns(), filtered on the nt offset between the
    query and sbjct splice sites and only reported when the sequence in
    between both introns can be aligned (ClustalW) into a tiny PacbPORF.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: kwargs['allow_phase_shift'] is forced to True
    @attention: kwargs used here: 'max_nt_offset', 'cig_min_aa_length',
                'cig_max_aa_length', 'aligned_site_max_triplet_distance'
                (the latter defaults to 'cig_max_aa_length')
    @attention: reported intron objects are changed in place (_distance,
                _gff['fsource'] and _linked_to_pacbporfs are set)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    def _say(*items):
        # one space separated line, just like the `print a, b, c` statement
        print(" ".join([ str(item) for item in items ]))

    def _site_offsets(intQ,intS):
        # alignment positions of the donors (on pacbporfD) and the acceptors
        # (on pacbporfA) and the query-minus-sbjct offset in nt for both
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True)
        distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase)
        distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase)
        return dQpos, dSpos, aQpos, aSpos, distDnt, distAnt

    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs,KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs['cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,
            verbose=verbose,**kwargs)
    cig_introns = []

    if verbose:
        _say("introns::", len(alg_introns), "cig_max_aa_length:",
            kwargs['cig_max_aa_length'],
            kwargs['aligned_site_max_triplet_distance'])

    # check if there is length congruence between the cig_introns
    for intQ,intS in alg_introns:
        dQpos, dSpos, aQpos, aSpos, distDnt, distAnt = _site_offsets(intQ,intS)

        if verbose:
            _say((intQ.donor.pos, intQ.acceptor.pos),
                (intS.donor.pos, intS.acceptor.pos),
                distDnt, distAnt, kwargs['max_nt_offset'])

        if abs(distDnt-distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        # NOTE(review): both triplet-distance checks below use the signed
        # offset (no abs()), exactly as before -- confirm this is intended
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distAnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            # (the acceptor offset used to be skipped: distDnt was tested twice)
            continue

        if kwargs['cig_min_aa_length'] <= abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\
        kwargs['cig_min_aa_length'] <= abs(distAnt/3) <= kwargs['cig_max_aa_length']:
            # putatively a closeby independant (intron) gain
            cig_introns.append( ( intQ, intS ) )

    if verbose:
        for intQ,intS in cig_introns:
            _say("cig?:", (intQ.donor.pos, intQ.acceptor.pos),
                (intS.donor.pos, intS.acceptor.pos))

    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ,intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dSpos, aQpos, aSpos, distDnt, distAnt = _site_offsets(intQ,intS)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        query_is_extended = distDnt > 0

        if query_is_extended:
            # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            mode   = "SQ"
            orfQ, orfS = pacbporfD.orfQ, pacbporfA.orfS
            qEnd   = orfQ.dnapos2aapos(intQ.donor.pos)
            qStart = qEnd - max(distA,distD)
            sStart = orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd   = sStart + max(distA,distD)
        else:
            # distDnt and distAnt are < 0
            # SBJCT is extended on the donor site
            mode   = "QS"
            orfQ, orfS = pacbporfA.orfQ, pacbporfD.orfS
            qStart = orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd   = qStart - min(distA,distD)
            sEnd   = orfS.dnapos2aapos(intS.donor.pos)
            sStart = sEnd + min(distA,distD)
        qSeq = orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
        sSeq = orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)

        # headers are truncated to prevent a (ClustalW) error
        headerQ = ("query_%s_%s_%s" % (qStart,qEnd,qSeq))[0:20]
        headerS = ("sbjct_%s_%s_%s" % (sStart,sEnd,sSeq))[0:20]

        if verbose:
            _say(mode, (distD,distA), qSeq, sSeq, headerQ, headerS,
                distDnt, distAnt, dQpos, aQpos, dSpos, aSpos)

        # superfluous check-doublecheck for sequence
        if not qSeq or not sSeq: continue

        # align the sequences with clustalw
        seqs = { headerQ: qSeq, headerS: sSeq }
        (alignedseqs,alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(
                    alignment=(
                            alignedseqs[headerQ],
                            alignment,
                            alignedseqs[headerS]
                            ),
                    coords=(qStart,qEnd,sStart,sEnd)
                    )
        if not pacbp: continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()
        if len(pacbp) < kwargs['cig_min_aa_length']: continue
        if len(pacbp) > kwargs['cig_max_aa_length']: continue
        # no alignment possible (anymore) -> try next
        if not pacbp: continue

        # initialize extended tiny PacbPORF caused by c.i.g.
        cig_pacbporf = pacbp2pacbporf(pacbp,orfQ,orfS)
        cig_pacbporf.extend_pacbporf_after_stops()

        if verbose:
            _say(pacbp, len(pacbp))
            _say(cig_pacbporf)
            _say("CIG:", intQ)
            _say("CIG:", intS)
            _say(distD, distA, distDnt, distAnt)
            cig_pacbporf.print_protein_and_dna()

        # add distance score to introns
        # The distance set in merge_pacbporfs_with_introns is large;
        # it is the actual distance between the splice sites. In CIG,
        # the measure for distance is the length difference between
        # the offset between query and sbjct measured on the cig_pacbporf
        intQ._distance = abs(distDnt-distAnt)
        intS._distance = abs(distDnt-distAnt)

        # add Alignment Positional Periphery Score into objects
        if query_is_extended:
            set_apps_intron_query(intQ,cig_pacbporf,pacbporfA)
            set_apps_intron_sbjct(intS,pacbporfD,cig_pacbporf)
        else:
            set_apps_intron_query(intQ,pacbporfD,cig_pacbporf)
            set_apps_intron_sbjct(intS,cig_pacbporf,pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPcig"
        intS._gff['fsource'] = "ABGPcig"

        # create _linked_to_xxx attributes
        intQ._linked_to_pacbporfs = [ cig_pacbporf ]
        intS._linked_to_pacbporfs = [ cig_pacbporf ]

        # append to found_cig_list
        found_cig_list.append( ( intQ, intS, cig_pacbporf ) )

    # return lists of closeby_independant_introns
    return found_cig_list
def update_PCG_with_signalpexons(signalpexonseqs, PCG, OPTIONS,
                                 min_pacbporf_identityscore=0.20,
                                 verbose=True):
    """
    Add ClustalW aligned SignalP exon PacbPORFs of target vs informants to PCG

    For every SignalP exon of the target and every informant still present
    in the PCG, the informant SignalP exons that are placeable in front of
    the existing PacbPORFs are aligned to the target exon. Per target exon
    and informant the best ('bits') alignment is stored in the PCG.

    @type  signalpexonseqs: dict
    @param signalpexonseqs: organism identifiers as keys, lists with
                            SignalP exon objects as values

    @type  PCG: object
    @param PCG: graph that is queried (organism_set, get_pacbps_by_organisms)
                and that receives the new PacbPORFs through pacbporf2PCG

    @type  OPTIONS: object
    @param OPTIONS: options object; only OPTIONS.target is used

    @type  min_pacbporf_identityscore: float
    @param min_pacbporf_identityscore: alignments with a lower identityscore
                                       are ignored

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  Boolean
    @return: True when at least one PacbPORF is added to the PCG
    """
    target = OPTIONS.target
    if target not in signalpexonseqs:
        return False

    is_any_pacbporf_added = False

    for targetSPexon in signalpexonseqs[target]:
        for informant, infSPlist in signalpexonseqs.items():
            if informant == target:
                continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set():
                continue

            # list to store signalp exons into
            signalpexon_pacbp_list = []

            # get ordered pacbporfs from the PCG
            thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target, informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue

            for informantSPexon in infSPlist:
                coords = [
                    targetSPexon.protein_start(),
                    targetSPexon.protein_end(),
                    informantSPexon.protein_start(),
                    informantSPexon.protein_end(),
                ]

                # prior to making ClustalW-PacbP, check PacbPCOORD
                # placeability into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                    targetSPexon.proteinsequence(),
                    informantSPexon.proteinsequence(),
                    targetSPexon.protein_start(),
                    informantSPexon.protein_start(),
                ))

                if False in [pacbpCoordsObj.is_positioned_compatibly(pacbporf)
                             for pacbporf in thepacbporfs]:
                    # *NOT* placeable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3:
                    # WAY TOO FAR in front of the current gene structure
                    # parts; treat as *NOT* placeable.
                    continue
                if dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue

                # perform ClustalW alignment on the SP exons
                (alignedseqs, alignment) = clustalw(seqs={
                    target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence(),
                })

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                    alignment=(alignedseqs[target],
                               alignment,
                               alignedseqs[informant]),
                    coords=coords)

                # is there any alignment constructed?
                if not pacbp:
                    continue

                # ignore (very) poor identityscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore:
                    continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(
                    pacbp, targetSPexon.orf, informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append(signalpexonPacbpORF)

                if verbose:
                    # single-argument print(): output equals the former
                    # print statements (pacbp is always true at this point)
                    print("%s %s" % (alignedseqs[target], target))
                    print(alignment)
                    print("%s %s" % (alignedseqs[informant], informant))
                    print("%s %s %s %s %s" % (
                        pacbp,
                        (target, targetSPexon.orf.id),
                        (informant, informantSPexon.orf.id),
                        "DISTANCE::", dist))
                    pacbp.print_protein()
                    print("")

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                    signalpexon_pacbp_list, order_by='bits', reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf, target, informant, PCG,
                             source='SignalP-ClustalW')
                is_any_pacbporf_added = True
                if verbose:
                    print("%s %s %s" % ("SignalP Exon added to PCG:",
                                        signalp_pacbporf, informant))

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
def update_PCG_with_signalpexons(signalpexonseqs,PCG,OPTIONS,
    min_pacbporf_identityscore=0.20,verbose=True):
    """
    Update the PCG with PacbPORFs obtained from aligned SignalP exons

    Each SignalP exon of OPTIONS.target is ClustalW-aligned to the SignalP
    exons of each informant that still occurs in the PCG, provided that the
    exon pair is placeable in front of the already known PacbPORFs. The
    highest scoring ('bits') PacbPORF per target exon and informant is
    stored in the PCG with source 'SignalP-ClustalW'.

    @type  signalpexonseqs: dict
    @param signalpexonseqs: organism identifiers as keys, lists with
                            SignalP exon objects as values

    @type  PCG: object
    @param PCG: graph to read the current PacbPORFs from and to store the
                new PacbPORFs into (via pacbporf2PCG)

    @type  OPTIONS: object
    @param OPTIONS: options object; only its attribute `target` is read

    @type  min_pacbporf_identityscore: float
    @param min_pacbporf_identityscore: minimal identityscore of an alignment

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  Boolean
    @return: True if any PacbPORF is added to the PCG, False otherwise
    """
    target = OPTIONS.target
    if target not in signalpexonseqs: return False

    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[target]:
        for informant,infSPlist in signalpexonseqs.items():
            if informant == target: continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set(): continue

            # list to store signalp exons into
            signalpexon_pacbp_list = []

            # get ordered pacbporfs from the PCG
            thepacbporfs = order_pacbporf_list(
                    PCG.get_pacbps_by_organisms(target,informant))

            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue

            for informantSPexon in infSPlist:
                targetseq    = targetSPexon.proteinsequence()
                informantseq = informantSPexon.proteinsequence()
                coords = [ targetSPexon.protein_start(),
                           targetSPexon.protein_end(),
                           informantSPexon.protein_start(),
                           informantSPexon.protein_end(), ]

                # prior to making ClustalW-PacbP, check PacbPCOORD
                # placeability into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                        targetseq,
                        informantseq,
                        targetSPexon.protein_start(),
                        informantSPexon.protein_start(),
                        ) )

                if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]:
                    # *NOT* placeable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH/3:
                    # WAY TOO FAR in front of current gene structure parts;
                    # handle as *NOT* placeable.
                    continue
                if dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue

                # perform ClustalW alignment on the SP exons
                (alignedseqs,alignment) = clustalw(
                        seqs={ target: targetseq, informant: informantseq } )

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                            alignment=(
                                    alignedseqs[target],
                                    alignment,
                                    alignedseqs[informant]
                                    ),
                            coords=coords
                            )

                # is there any alignment constructed?
                if not pacbp: continue

                # ignore (very) poor identityscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore: continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(pacbp,
                        targetSPexon.orf,informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append( signalpexonPacbpORF )

                ################################################################
                if verbose:
                    # print() with a single argument behaves the same as the
                    # print statement; pacbp is always true when here
                    print(" ".join([ alignedseqs[target], str(target) ]))
                    print(alignment)
                    print(" ".join([ alignedseqs[informant], str(informant) ]))
                    print(" ".join([ str(item) for item in (
                            pacbp,
                            (target, targetSPexon.orf.id),
                            (informant, informantSPexon.orf.id),
                            "DISTANCE::", dist ) ]))
                    pacbp.print_protein()
                    print("")
                ################################################################

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if not signalpexon_pacbp_list: continue
            signalpexon_pacbp_list = order_list_by_attribute(
                    signalpexon_pacbp_list,order_by='bits',reversed=True)
            # store best bitscoring pacbporf to PCG
            signalp_pacbporf = signalpexon_pacbp_list[0]
            pacbporf2PCG(signalp_pacbporf,target,informant,PCG,
                    source='SignalP-ClustalW')
            is_any_pacbporf_added = True
            ####################################################################
            if verbose:
                print(" ".join([ "SignalP Exon added to PCG:",
                        str(signalp_pacbporf), str(informant) ]))
            ####################################################################

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
def clustalwinput2cbg(seqs,orfs,coords,nodes,
    matrix = None,
    minimal_overall_spanning_range_size = 3,
    verbose=False):
    """
    Translate a ClustalW alignment of the input sequences into a new CBG

    @type  seqs: dict
    @param seqs: dict with ORGANISM IDENTIFIER as keys, sequences as values

    @type  orfs: dict
    @param orfs: dict with ORGANISM IDENTIFIER as keys, Orf objects as values

    @type  coords: dict
    @param coords: dict with ORGANISM IDENTIFIER as keys, [ sta, end ] as values

    @type  nodes: list
    @param nodes: list with nodes corresponding to the ORGANISM IDENTIFIER
                  in the dictionaries

    @type  matrix: object
    @param matrix: ProteinSimilarityMatrix; its attribute `matrix` is used
                   to recreate the pairwise alignment match strings

    @type  minimal_overall_spanning_range_size: integer
    @param minimal_overall_spanning_range_size: value that is set as
                   MINIMAL_OVERAL_SPANNING_RANGE_SIZE on the new CBG

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @attention: coordinates in coords should correspond to the sequences in seqs!
    @attention: a missing matrix raises a TypeError, but only AFTER the
                alignment has been made; a vanished alignment returns None

    @rtype:  CodingBlockGraph or None
    @return: the new CBG, or None when the alignment vanished, a pairwise
             PacbP is not creatable or there is no OMSR
    """
    # do clustalw and strip_alignment_for_exterior_gaps
    (algseqs,algm) = clustalw(seqs=seqs)

    ####################################################################
    if verbose:
        # same output as `print seqs, "\n", algseqs, "\n", algm, "\n", coords`
        print("%s \n%s \n%s \n%s" % (seqs, algseqs, algm, coords))
    ####################################################################

    # test-strip on deepcopies: the unstripped data is needed further on
    _testalgseqs,_testalgm,_testcoords = strip_alignment_for_exterior_gaps(
            deepcopy(algseqs),deepcopy(algm),deepcopy(coords))

    if not _testalgm:
        ####################################################################
        if verbose:
            print("NO ALGM\n%s \n%s \n%s" % (seqs, _testalgseqs, _testalgm))
        ####################################################################
        # alignment completely vanished by `strip_alignment_for_exterior_gaps`
        return None

    # do required import here (prevent circular imports)
    from graphAbgp.graph_codingblock import CodingBlockGraph
    from graphAbgp.exceptions import NoOverallMinimalSpanningRange
    from pacb import conversion as pacbconversion

    if not matrix:
        # a string can not be raised (TypeError on Python >= 2.6, which hid
        # this message); raise a proper exception instance instead
        raise TypeError("No ProteinSimilarityMatrix applied!")

    # translate the clustalw alignment into an artificial CBG
    newcbg = CodingBlockGraph()
    newcbg.add_nodes(nodes)
    for nodeA,nodeB in newcbg.pairwisecrosscombinations_node():
        orgA = newcbg.organism_by_node(nodeA)
        orgB = newcbg.organism_by_node(nodeB)

        # create stripped alignments for this pair of sequences
        # do not forget to make deepcopies of the data structures!
        subcoords  = { orgA: coords[orgA],  orgB: coords[orgB] }
        subalgseqs = { orgA: algseqs[orgA], orgB: algseqs[orgB] }
        _algseqs,_algm,_coords = strip_alignment_for_exterior_gaps(
                deepcopy(subalgseqs),deepcopy(algm),deepcopy(subcoords) )

        # recreate a pairwise ClustalW alignment string
        _algm = make_clustalw_alignment_match(
                _algseqs[orgA],_algseqs[orgB],
                matrix = matrix.matrix )

        # _algseqs keys are organisms, not nodes!
        alignment  = ( _algseqs[orgA], _algm, _algseqs[orgB] )
        paircoords = ( _coords[orgA][0], _coords[orgA][1],
                       _coords[orgB][0], _coords[orgB][1] )
        pacbp = pacbconversion.pacbp_from_clustalw(
                alignment=alignment,coords=paircoords)

        if pacbp is None:
            # pacbp is not creatable -> no complete CBG possible
            return None

        pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB])

        ####################################################################
        if verbose:
            print("%s %s %s" % (orgA, orgB, pacbporf))
            for item in alignment: print(item)
            print(paircoords)
        ####################################################################

        wt = pacbporf.bitscore
        pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB)
        newcbg.add_edge(nodeA,nodeB,wt=wt)
        newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf

    # update edge weight by OMSR and return
    newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE =\
        minimal_overall_spanning_range_size
    if not newcbg.has_overall_minimal_spanning_range():
        return None
    newcbg.update_edge_weights_by_minimal_spanning_range()
    try:
        newcbg.correct_pacbpgaps_nearby_omsr()
    except NoOverallMinimalSpanningRange:
        return None
    return newcbg
def WORKING_sprdif2clustalw2cbg(cbg,sprdif,SCAFFOLD_GAP_OMSR_OFFSET=0,verbose=False):
    """
    Align the sprdif sequence ranges with ClustalW and return them as a CBG

    @type  cbg: object
    @param cbg: graph object that delivers the organism identifiers and
                Orfs of the nodes (presumably a CodingBlockGraph)

    @type  sprdif: dict
    @param sprdif: dict with nodes as keys, collections of AA positions as
                   values; min() and max() define the range that is aligned

    @type  SCAFFOLD_GAP_OMSR_OFFSET: integer
    @param SCAFFOLD_GAP_OMSR_OFFSET: number of positions to enlarge each
                   range with on both sides

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  CodingBlockGraph or None
    @return: the new CBG, or None when the alignment vanished, a pairwise
             PacbP is not creatable or there is no OMSR
    """
    # gather sequence concerning the scaffold gap of the mutual nodes
    seqs, orfs, coords = {}, {}, {}
    for node in sprdif.keys():
        org = cbg.organism_by_node(node)
        sta = min( sprdif[node] ) - SCAFFOLD_GAP_OMSR_OFFSET
        end = max( sprdif[node] ) + SCAFFOLD_GAP_OMSR_OFFSET
        orf = cbg.get_orfs_of_graph(organism=org)[0]
        seq = orf.getaas(abs_pos_start=sta,abs_pos_end=end)
        seqs[org]   = seq
        orfs[org]   = orf
        coords[org] = [sta,end]

    # do clustalw and strip_alignment_for_exterior_gaps
    (_algseqs,_algm) = clustalw(seqs=seqs)

    ####################################################################
    if verbose:
        # same output as `print seqs, "\n", _algseqs, "\n", _algm`
        print("%s \n%s \n%s" % (seqs, _algseqs, _algm))
    ####################################################################

    _algseqs,_algm,coords = strip_alignment_for_exterior_gaps(_algseqs,_algm,coords)

    if not _algm:
        ####################################################################
        if verbose:
            print("NO ALGM.??\n%s \n%s \n%s" % (seqs, _algseqs, _algm))
        ####################################################################
        # alignment completely vanished by `strip_alignment_for_exterior_gaps`
        return None

    # do required import here (prevent circular imports)
    from graphAbgp.graph_codingblock import CodingBlockGraph
    from graphAbgp.exceptions import NoOverallMinimalSpanningRange
    from pacb import conversion as pacbconversion
    # NOTE(review): both names below were only referenced by a disabled
    # cexpander check; the import is kept so import behaviour is unchanged
    from lib_cexpander import cexpander_checkCBG4omsrbordergaps, ZeroUniformlyAlignedPositions

    # translate the clustalw alignment into an artificial CBG
    newcbg = CodingBlockGraph()
    newcbg.add_nodes(sprdif.keys())
    for nodeA,nodeB in newcbg.pairwisecrosscombinations_node():
        orgA = cbg.organism_by_node(nodeA)
        orgB = cbg.organism_by_node(nodeB)
        # _algseqs keys are organisms, not nodes!
        alignment  = ( _algseqs[orgA], _algm, _algseqs[orgB] )
        # the end coordinate of A used to be taken from `org`, the stale
        # loop variable of the gathering loop above (= the last organism)
        paircoords = ( coords[orgA][0], coords[orgA][1],
                       coords[orgB][0], coords[orgB][1] )
        pacbp = pacbconversion.pacbp_from_clustalw(alignment=alignment,coords=paircoords)
        if pacbp is None:
            # pacbp is not creatable -> no complete CBG possible
            return None
        pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB])
        wt = pacbporf.bitscore
        pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB)
        newcbg.add_edge(nodeA,nodeB,wt=wt)
        newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf

    # update edge weight by OMSR and return
    newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE = 3
    if not newcbg.has_overall_minimal_spanning_range():
        return None
    newcbg.update_edge_weights_by_minimal_spanning_range()
    try:
        newcbg.correct_pacbpgaps_nearby_omsr()
    except NoOverallMinimalSpanningRange:
        return None
    return newcbg
def WORKING_sprdif2clustalw2cbg(cbg,sprdif,SCAFFOLD_GAP_OMSR_OFFSET=1,verbose=False):
    """
    Realign the sprdif sequence ranges with ClustalW into a new CBG

    @type  cbg: object
    @param cbg: graph object that delivers organism identifiers, Orfs and
                PacbP(ORF)s of the nodes (presumably a CodingBlockGraph)

    @type  sprdif: dict
    @param sprdif: dict with nodes as keys, collections of AA positions as
                   values; min() and max() define the range that is aligned

    @type  SCAFFOLD_GAP_OMSR_OFFSET: integer
    @param SCAFFOLD_GAP_OMSR_OFFSET: number of positions to enlarge each
                   range with on both sides (limited to the Orf's range)

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  CodingBlockGraph or None
    @return: the new CBG, or None when the alignment vanished, a pairwise
             PacbP is not creatable or there is no OMSR
    """
    # collect per organism the sequence, Orf and AA range of the scaffold gap
    seqs   = {}
    orfs   = {}
    coords = {}
    for gapnode in sprdif.keys():
        organism = cbg.organism_by_node(gapnode)
        aa_sta   = min( sprdif[gapnode] ) - SCAFFOLD_GAP_OMSR_OFFSET
        aa_end   = max( sprdif[gapnode] ) + SCAFFOLD_GAP_OMSR_OFFSET
        theorf   = cbg.get_orfs_of_graph(organism=organism)[0]
        # the offset may not push the range outside of the Orf
        aa_sta = max([ aa_sta, theorf.protein_startPY ])
        aa_end = min([ aa_end, theorf.protein_endPY ])
        seqs[organism]   = theorf.getaas(abs_pos_start=aa_sta,abs_pos_end=aa_end)
        orfs[organism]   = theorf
        coords[organism] = [aa_sta,aa_end]

    # multiple alignment of all gathered sequences
    (algseqs,algm) = clustalw(seqs=seqs)

    if verbose:
        print(str(seqs) + " " + "\n" + str(algseqs) + " " + "\n" +
              str(algm) + " " + "\n" + str(coords))

    # stripping is tried on deepcopies; only the match string is inspected
    teststripped = strip_alignment_for_exterior_gaps(
            deepcopy(algseqs),deepcopy(algm),deepcopy(coords))
    _testalgseqs,_testalgm,_testcoords = teststripped

    if not _testalgm:
        if verbose:
            print("NO ALGM\n" + str(seqs) + " " + "\n" +
                  str(_testalgseqs) + " " + "\n" + str(_testalgm))
        # nothing is left of the alignment after stripping exterior gaps
        return None

    # imports are done here to prevent circular imports
    from graphAbgp.graph_codingblock import CodingBlockGraph
    from graphAbgp.exceptions import NoOverallMinimalSpanningRange
    from pacb import conversion as pacbconversion
    from lib_cexpander import cexpander_checkCBG4omsrbordergaps, ZeroUniformlyAlignedPositions

    # build the artificial CBG, one PacbPORF per pair of nodes
    newcbg = CodingBlockGraph()
    newcbg.add_nodes(sprdif.keys())
    all_pacbps_created = True
    for nodeA,nodeB in newcbg.pairwisecrosscombinations_node():
        orgA = cbg.organism_by_node(nodeA)
        orgB = cbg.organism_by_node(nodeB)

        # strip the exterior gaps for this pair only (on deepcopies!)
        paircoords_in = { orgA: coords[orgA],  orgB: coords[orgB] }
        pairseqs_in   = { orgA: algseqs[orgA], orgB: algseqs[orgB] }
        _algseqs,_algm,_coords = strip_alignment_for_exterior_gaps(
                deepcopy(pairseqs_in),deepcopy(algm),deepcopy(paircoords_in) )

        # the pairwise match string is recreated with the similarity
        # matrix of the first original PacbP(ORF) between both nodes
        original_pacbps = cbg.get_pacbps_by_nodes(node1=nodeA,node2=nodeB)
        protsimmtrx = original_pacbps[0].MATRIX
        _algm = make_clustalw_alignment_match(
                _algseqs[orgA],_algseqs[orgB],
                matrix = protsimmtrx.matrix )

        # mind: _algseqs and _coords have organisms as keys, not nodes
        seqA, seqB = _algseqs[orgA], _algseqs[orgB]
        alignment  = ( seqA, _algm, seqB )
        paircoords = tuple( _coords[orgA][0:2] ) + tuple( _coords[orgB][0:2] )
        pacbp = pacbconversion.pacbp_from_clustalw(
                alignment=alignment,coords=paircoords)

        if pacbp == None:
            # no PacbP for this pair -> no CBG can be returned
            all_pacbps_created = False
            break

        pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB])

        if verbose:
            print(" ".join([ str(orgA), str(orgB), str(pacbporf) ]))
            for algline in alignment:
                print(algline)
            print(paircoords)

        bitscore = pacbporf.bitscore
        uniquekey = pacbporf.construct_unique_key(nodeA,nodeB)
        newcbg.add_edge(nodeA,nodeB,wt=bitscore)
        newcbg.pacbps[(uniquekey,nodeA,nodeB)] = pacbporf

    if not all_pacbps_created:
        return None

    # an OMSR is required; edge weights are updated by it
    newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE = 3
    if not newcbg.has_overall_minimal_spanning_range():
        return None
    newcbg.update_edge_weights_by_minimal_spanning_range()
    try:
        newcbg.correct_pacbpgaps_nearby_omsr()
    except NoOverallMinimalSpanningRange:
        return None
    return newcbg
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,
                                                     pacbporfA,
                                                     verbose=False,
                                                     **kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    Candidate ( intronQ, intronS ) pairs are obtained from
    merge_pacbporfs_with_introns(), filtered on the nt offset between the
    query and sbjct splice sites and only reported when the sequence in
    between both introns can be aligned (ClustalW) into a tiny PacbPORF.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: kwargs['allow_phase_shift'] is forced to True
    @attention: kwargs used here: 'max_nt_offset', 'cig_min_aa_length',
                'cig_max_aa_length', 'aligned_site_max_triplet_distance'
                (the latter defaults to 'cig_max_aa_length')
    @attention: reported intron objects are changed in place (_distance,
                _gff['fsource'] and _linked_to_pacbporfs are set)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    def _say(*items):
        # one space separated line, just like the `print a, b, c` statement
        print(" ".join([str(item) for item in items]))

    def _site_offsets(intQ, intS):
        # alignment positions of the donors (on pacbporfD) and the acceptors
        # (on pacbporfA) and the query-minus-sbjct offset in nt for both
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,
                                                     forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,
                                                     forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,
                                                     forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,
                                                     forced_return=True)
        distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase)
        distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase)
        return dQpos, dSpos, aQpos, aSpos, distDnt, distAnt

    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs, KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if 'aligned_site_max_triplet_distance' not in kwargs:
        kwargs['aligned_site_max_triplet_distance'] = kwargs[
            'cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,
                                               pacbporfA,
                                               verbose=verbose,
                                               **kwargs)
    cig_introns = []

    if verbose:
        _say("introns::", len(alg_introns), "cig_max_aa_length:",
             kwargs['cig_max_aa_length'],
             kwargs['aligned_site_max_triplet_distance'])

    # check if there is length congruence between the cig_introns
    for intQ, intS in alg_introns:
        dQpos, dSpos, aQpos, aSpos, distDnt, distAnt = _site_offsets(
            intQ, intS)

        if verbose:
            _say((intQ.donor.pos, intQ.acceptor.pos),
                 (intS.donor.pos, intS.acceptor.pos),
                 distDnt, distAnt, kwargs['max_nt_offset'])

        if abs(distDnt - distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        # NOTE(review): both triplet-distance checks below use the signed
        # offset (no abs()), exactly as before -- confirm this is intended
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distAnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            # (the acceptor offset used to be skipped: distDnt was tested twice)
            continue

        min_aa = kwargs['cig_min_aa_length']
        max_aa = kwargs['cig_max_aa_length']
        if min_aa <= abs(distDnt/3) <= max_aa and\
        min_aa <= abs(distAnt/3) <= max_aa:
            # putatively a closeby independant (intron) gain
            cig_introns.append((intQ, intS))

    if verbose:
        for intQ, intS in cig_introns:
            _say("cig?:", (intQ.donor.pos, intQ.acceptor.pos),
                 (intS.donor.pos, intS.acceptor.pos))

    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ, intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dSpos, aQpos, aSpos, distDnt, distAnt = _site_offsets(
            intQ, intS)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        query_is_extended = distDnt > 0

        if query_is_extended:
            # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            mode = "SQ"
            orfQ, orfS = pacbporfD.orfQ, pacbporfA.orfS
            qEnd = orfQ.dnapos2aapos(intQ.donor.pos)
            qStart = qEnd - max(distA, distD)
            sStart = orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd = sStart + max(distA, distD)
        else:
            # distDnt and distAnt are < 0
            # SBJCT is extended on the donor site
            mode = "QS"
            orfQ, orfS = pacbporfA.orfQ, pacbporfD.orfS
            qStart = orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd = qStart - min(distA, distD)
            sEnd = orfS.dnapos2aapos(intS.donor.pos)
            sStart = sEnd + min(distA, distD)
        qSeq = orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd)
        sSeq = orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd)

        # headers are truncated to prevent a (ClustalW) error
        headerQ = ("query_%s_%s_%s" % (qStart, qEnd, qSeq))[0:20]
        headerS = ("sbjct_%s_%s_%s" % (sStart, sEnd, sSeq))[0:20]

        if verbose:
            _say(mode, (distD, distA), qSeq, sSeq, headerQ, headerS,
                 distDnt, distAnt, dQpos, aQpos, dSpos, aSpos)

        # superfluous check-doublecheck for sequence
        if not qSeq or not sSeq:
            continue

        # align the sequences with clustalw
        seqs = {headerQ: qSeq, headerS: sSeq}
        (alignedseqs, alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(alignment=(alignedseqs[headerQ],
                                               alignment,
                                               alignedseqs[headerS]),
                                    coords=(qStart, qEnd, sStart, sEnd))
        if not pacbp:
            continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()
        if len(pacbp) < kwargs['cig_min_aa_length']:
            continue
        if len(pacbp) > kwargs['cig_max_aa_length']:
            continue
        if not pacbp:
            # no alignment possible (anymore) -> try next
            continue

        # initialize extended tiny PacbPORF caused by c.i.g.
        cig_pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS)
        cig_pacbporf.extend_pacbporf_after_stops()

        if verbose:
            _say(pacbp, len(pacbp))
            _say(cig_pacbporf)
            _say("CIG:", intQ)
            _say("CIG:", intS)
            _say(distD, distA, distDnt, distAnt)
            cig_pacbporf.print_protein_and_dna()

        # add distance score to introns
        # The distance set in merge_pacbporfs_with_introns is large;
        # it is the actual distance between the splice sites. In CIG,
        # the measure for distance is the length difference between
        # the offset between query and sbjct measured on the cig_pacbporf
        intQ._distance = abs(distDnt - distAnt)
        intS._distance = abs(distDnt - distAnt)

        # add Alignment Positional Periphery Score into objects
        if query_is_extended:
            set_apps_intron_query(intQ, cig_pacbporf, pacbporfA)
            set_apps_intron_sbjct(intS, pacbporfD, cig_pacbporf)
        else:
            set_apps_intron_query(intQ, pacbporfD, cig_pacbporf)
            set_apps_intron_sbjct(intS, cig_pacbporf, pacbporfA)

        # set GFF fsource attribute for recognition of intron sources
        intQ._gff['fsource'] = "ABGPcig"
        intS._gff['fsource'] = "ABGPcig"

        # create _linked_to_xxx attributes
        intQ._linked_to_pacbporfs = [cig_pacbporf]
        intS._linked_to_pacbporfs = [cig_pacbporf]

        # append to found_cig_list
        found_cig_list.append((intQ, intS, cig_pacbporf))

    # return lists of closeby_independant_introns
    return found_cig_list