def is_hmmpacbporf_conflicting_with_pacbporflist(hmmpacbporf, pacbporflist):
    """
    Check whether a HMM-derived PacbPORF conflicts with a list of PacbPORFs

    @type  hmmpacbporf: PacbPORF object
    @param hmmpacbporf: candidate PacbPORF to be tested

    @type  pacbporflist: list
    @param pacbporflist: PacbPORF objects the candidate is compared to

    @rtype:  Boolean
    @return: True as soon as one list member is not positioned compatibly
             with the candidate, overlaps it for more than 0.25, or when
             overlap correction leaves one of both PacbPs with length 0;
             False otherwise (also for an empty list)
    """
    for accepted in pacbporflist:
        # incompatible positioning is a conflict by definition
        if not accepted.is_postioned_compatibly(hmmpacbporf):
            return True

        fraction = accepted.overlap(hmmpacbporf)
        if fraction == 0.0:
            # no overlap at all -> nothing to correct for this one
            continue
        elif fraction <= 0.25:
            # slight overlap: convert both to PacbPs and let the
            # overlap-correction functions trim them (orientation is
            # left to order_pacbp_list / the correction functions)
            own_pacbp = pacbporf2pacbp(accepted)
            hmm_pacbp = pacbporf2pacbp(hmmpacbporf)
            first, second = order_pacbp_list([own_pacbp, hmm_pacbp])
            first, second, _status = correct_overlap_for_sbjct(
                first, second, verbose=False)
            first, second, _status = correct_overlap_for_query(
                first, second, verbose=False)
            if hmm_pacbp.length == 0:
                return True
            if own_pacbp.length == 0:
                print("FatalWarning: HMM overlap caused PacbPORF to dissapear")
                return True
            # continue the comparison with the corrected hmmpacbporf
            hmmpacbporf = pacbp2pacbporf(
                hmm_pacbp, hmmpacbporf.orfQ, hmmpacbporf.orfS)
        else:
            # overlap too large to be corrected
            return True

    # none of the PacbPORFs in the list conflicts
    return False
def is_hmmpacbporf_conflicting_with_pacbporflist(hmmpacbporf,pacbporflist):
    """
    Check whether a HMM-derived PacbPORF conflicts with a list of PacbPORFs

    @type  hmmpacbporf: PacbPORF object
    @param hmmpacbporf: candidate PacbPORF to be tested

    @type  pacbporflist: list
    @param pacbporflist: PacbPORF objects the candidate is compared to

    @rtype:  Boolean
    @return: True as soon as one list member is not positioned compatibly
             with the candidate, overlaps it for more than 0.25, or when
             overlap correction leaves one of both PacbPs with length 0;
             False otherwise (also for an empty list)
    """
    for accepted in pacbporflist:
        # incompatible positioning is a conflict by definition
        if not accepted.is_postioned_compatibly(hmmpacbporf):
            return True

        fraction = accepted.overlap(hmmpacbporf)
        if fraction == 0.0:
            # no overlap at all -> nothing to correct for this one
            continue
        elif fraction <= 0.25:
            # slight overlap: convert both to PacbPs and let the
            # overlap-correction functions trim them (orientation is
            # left to order_pacbp_list / the correction functions)
            own_pacbp = pacbporf2pacbp(accepted)
            hmm_pacbp = pacbporf2pacbp(hmmpacbporf)
            first, second = order_pacbp_list([own_pacbp, hmm_pacbp])
            first, second, _status = correct_overlap_for_sbjct(
                first, second, verbose=False)
            first, second, _status = correct_overlap_for_query(
                first, second, verbose=False)
            if hmm_pacbp.length == 0:
                return True
            if own_pacbp.length == 0:
                print("FatalWarning: HMM overlap caused PacbPORF to dissapear")
                return True
            # continue the comparison with the corrected hmmpacbporf
            hmmpacbporf = pacbp2pacbporf(
                hmm_pacbp, hmmpacbporf.orfQ, hmmpacbporf.orfS)
        else:
            # overlap too large to be corrected
            return True

    # none of the PacbPORFs in the list conflicts
    return False
def _merge_pacbporfs_by_two_tinyexons(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """
    Try to connect 2 PacbPORF objects by 2 consecutive tinyexons (3 introns)

    For every donor site on the donor Orf and every acceptor site on the
    acceptor Orf, the nt distance between both sites on the *other*
    (projected) sequence is calculated. Combinations of two tinyexons (as
    returned by merge_orfs_with_two_tinyexons) of exactly that length are
    aligned against the projected Orf's protein sequence; sufficiently
    similar combinations are converted into two tinyexon PacbPORFs and
    three IntronConnectingOrfs objects.

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that delivers the donor sites

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that delivers the acceptor sites

    @type  orfSetObject: object
    @param orfSetObject: object whose `orfs` attribute is passed to
                         merge_orfs_with_two_tinyexons

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @attention: **kwargs are first passed through _update_kwargs() with
                KWARGS_PROJECTED_TINYEXON; the keys read here are
                'max_tinyexon_nt_length', 'min_tinyexon_nt_length' and
                'min_tinyexon_intron_nt_length'

    @raise InproperlyAppliedArgument: queryorsbjct is not 'query' or 'sbjct'

    @rtype:  list
    @return: list of tuples ( preceding_intron, intron, subsequent_intron,
             tinypacbporf1, tinypacbporf2 ); empty when nothing was found
    """
    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)
    # NOTE(review): `tinyexons` is never read or appended to in this function
    tinyexons = []
    sposD = pacbporfD._get_original_alignment_pos_start()
    eposD = pacbporfD._get_original_alignment_pos_end()
    sposA = pacbporfA._get_original_alignment_pos_start()
    eposA = pacbporfA._get_original_alignment_pos_end()
    # select the Orfs to scan for splice sites (donorOrf, accepOrf) and the
    # Orf of the other sequence on which the gap is projected (prjctOrf);
    # NOTE(review): of the 4 coordinates below only dStart and aEnd are used
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        dStart,dEnd = sposD.query_dna_start, eposD.query_dna_end
        aStart,aEnd = sposA.query_dna_start, eposA.query_dna_end
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        dStart,dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end
        aStart,aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    # get all potential combinations of two tinyexons
    tinyexoncombis = merge_orfs_with_two_tinyexons(
        donorOrf, accepOrf,
        donorOrf._donor_sites,
        accepOrf._acceptor_sites,
        orfSetObject.orfs,
        )

    results = []

    for dObj in donorOrf._donor_sites:
        # translate the donor's DNA position into an alignment position
        # index plus the nt offset (phase) within that position
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            # (presumably the sites are ordered on position, so all
            #  following sites are out of range too -- TODO confirm)
            break

        # check if dObj is on pfD;
        # introns of tinyexons can be projected outside of pfD/pfA area
        if dObj.pos < dStart: continue

        for aObj in accepOrf._acceptor_sites:
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break

            # check if aObj is on pfA;
            # introns of tinyexons can be projected outside of pfD/pfA area
            if aObj.pos > aEnd: continue

            # DNA coordinates of both splice sites on the projected
            # (other) sequence
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            # nt length that both tinyexons together have to fill;
            # length limits are per tinyexon, hence *2
            distance = posAsbjct - posDsbjct
            if distance >= (kwargs['max_tinyexon_nt_length']*2):
                break
            if distance < (kwargs['min_tinyexon_nt_length']*2):
                continue

            # keep only tinyexon combinations of exactly this length, with
            # matching phases and enough room for the flanking introns
            filtered_tinyexoncombis = _filter_tinyexoncombis(tinyexoncombis,
                min_length = distance,
                max_length = distance,
                min_first_acceptor_pos = dObj.pos + kwargs['min_tinyexon_intron_nt_length'],
                max_final_donor_pos = aObj.pos - kwargs['min_tinyexon_intron_nt_length'],
                phase_final_donor = aObj.phase,
                phase_first_acceptor= dObj.phase,
                )

            if not filtered_tinyexoncombis: continue

            ####################################################################
            if verbose:
                print distance, dObj, aObj, len(tinyexoncombis),
                print len(filtered_tinyexoncombis)
            ####################################################################

            for exon1,intron,exon2 in filtered_tinyexoncombis:
                # make preceding intron
                preceding_intron = IntronConnectingOrfs(
                    dObj,exon1.acceptor,
                    None,donorOrf,exon1.orf )

                # make subsequent intron
                subsequent_intron = IntronConnectingOrfs(
                    exon2.donor, aObj,
                    None,exon2.orf,accepOrf)

                ################################################################
                if verbose:
                    print "\t", exon1, exon1.proteinsequence(),
                    print preceding_intron.phase, exon1.donor.phase,
                    print subsequent_intron.phase, preceding_intron.shared_aa,
                    print intron.shared_aa, subsequent_intron.shared_aa
                    print "\t", exon2, exon2.proteinsequence()
                ################################################################

                # get prjctOrf sequence for comparison
                correctionA = 0
                if aObj.phase != 0:
                    # INCLUDE the final AA which is broken by the splicesite
                    correctionA=1
                if queryorsbjct == "query":
                    startPos,_phase = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
                    stopPos,_phase = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
                    start = pacbporfD._positions[startPos].sbjct_pos
                    stop = pacbporfA._positions[stopPos].sbjct_pos + correctionA
                else:
                    startPos,_phase = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
                    stopPos,_phase = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
                    start = pacbporfD._positions[startPos].query_pos
                    stop = pacbporfA._positions[stopPos].query_pos + correctionA

                if stop <= start:
                    # tinyexon is so tiny that is does not have a single
                    # full aligned AA -> discard here
                    continue

                # actually get the prjctOrf sequence
                aaseq = prjctOrf.getaas(abs_pos_start=start,abs_pos_end=stop)

                # initialize a PacbP for the combination of both tinyexons
                # afterwards, check if the indentityscore is > 0.XX
                from pacb import PacbP

                # protein sequence of the candidate, in order: AA shared over
                # the 1st intron, exon1, AA shared over the middle intron,
                # exon2, AA shared over the 3rd intron
                seqparts = [ preceding_intron.shared_aa,
                             exon1.proteinsequence(),
                             intron.shared_aa,
                             exon2.proteinsequence(),
                             subsequent_intron.shared_aa ]

                ################################################################
                # NOTE: printed on a length mismatch too, not only if verbose
                if verbose or len("".join(seqparts)) != len(aaseq):
                    print pacbporfD
                    print exon1.orf, exon2.orf, prjctOrf
                    print pacbporfA
                    print seqparts
                    print aaseq, len(aaseq), len("".join(seqparts)), (start,stop)
                    print "'%s'" % queryorsbjct,
                    print "Q", (algDobj.query_pos, algAobj.query_pos),
                    print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos)
                    print "distance:", distance, kwargs['max_tinyexon_nt_length'],
                    print (posDsbjct, posAsbjct),
                    print "Q-dna:", ( algDobj.query_dna_start, dPhase, algAobj.query_dna_start, aPhase ),
                    print "S-dna:", ( algDobj.sbjct_dna_start, dPhase, algAobj.sbjct_dna_start, aPhase )
                ################################################################

                # ignore by continue when sequences not identical in length
                if len("".join(seqparts)) != len(aaseq): continue

                # ungapped test alignment of candidate vs projected sequence
                testpacbp = PacbP(input=( "".join(seqparts), aaseq, 0, 0) )
                testpacbp.strip_unmatched_ends()

                # require identityscore > 0.60 and > 70% of the projected
                # sequence still covered after stripping unmatched ends
                if not ( testpacbp.identityscore > 0.60 and\
                (float(testpacbp.length) / len(aaseq)) > 0.70 ):
                    # not a very convincing alignment
                    continue

                ################################################################
                if verbose:
                    print testpacbp
                    testpacbp.print_protein()
                ################################################################

                # if here, succesfully mapped 2 tiny exons!!
                # get all sequences/coordinates in place for
                # pacbporf formation
                orfQ1 = exon1.orf
                orfS1 = prjctOrf
                orfQ2 = exon2.orf
                orfS2 = prjctOrf
                seqQ1 = exon1.proteinsequence()
                seqQ2 = exon2.proteinsequence()
                # DNA -> AA coordinate; this is Python 2 `/`, i.e. floor
                # division when pos is an int -- TODO confirm pos is an int
                coordQ1 = exon1.acceptor.pos / 3
                coordS1 = start
                coordQ2 = exon2.acceptor.pos / 3
                coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len(seqparts[2])
                # split the projected sequence over both tinyexons
                seqS1 = aaseq[0:(len(seqparts[0])+len(seqparts[1]))]
                seqS2 = aaseq[-(len(seqparts[3])+len(seqparts[4])):]
                # drop the AA shared over the outer introns (when present),
                # so seqS1/seqS2 are as long as the exon protein sequences
                if len(seqparts[0]):
                    seqS1 = seqS1[1:]
                    coordS1 += 1
                if len(seqparts[4]):
                    seqS2 = seqS2[:-1]

                if queryorsbjct == "sbjct":
                    # swap query <-> sbjct
                    orfQ1,orfS1 = orfS1,orfQ1
                    orfQ2,orfS2 = orfS2,orfQ2
                    seqQ1,seqS1 = seqS1,seqQ1
                    seqQ2,seqS2 = seqS2,seqQ2
                    coordQ1,coordS1 = coordS1,coordQ1
                    coordQ2,coordS2 = coordS2,coordQ2

                ################################################################
                # NOTE(review): the labels do not match the printed data; the
                # 1st line holds the query data of both tinyexons, the 2nd
                # line the sbjct data of both tinyexons
                if verbose:
                    print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2
                    print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2
                ################################################################

                # make pacbporfs
                pacbp1 = PacbP(input=( seqQ1, seqS1, coordQ1, coordS1) )
                pacbp1.strip_unmatched_ends()
                tinypacbporf1 = pacbp2pacbporf(pacbp1,orfQ1,orfS1)
                tinypacbporf1.extend_pacbporf_after_stops()
                pacbp2 = PacbP(input=( seqQ2, seqS2, coordQ2, coordS2) )
                pacbp2.strip_unmatched_ends()
                tinypacbporf2 = pacbp2pacbporf(pacbp2,orfQ2,orfS2)
                tinypacbporf2.extend_pacbporf_after_stops()

                ################################################################
                if verbose:
                    print tinypacbporf1
                    tinypacbporf1.print_protein_and_dna()
                    print tinypacbporf2
                    tinypacbporf2.print_protein_and_dna()
                ################################################################

                ################################################################
                # set some meta-data properties to the intron objects
                ################################################################

                # add distance score to intron
                preceding_intron._distance = 0
                intron._distance = 0
                subsequent_intron._distance = 0

                # add Alignment Positional Periphery Score into objects
                # (the returned status `succes` is not inspected)
                if queryorsbjct == "query":
                    succes = set_apps_intron_query(preceding_intron,pacbporfD,tinypacbporf1)
                    succes = set_apps_intron_query(intron,tinypacbporf1,tinypacbporf2)
                    succes = set_apps_intron_query(subsequent_intron,tinypacbporf2,pacbporfA)
                else:
                    succes = set_apps_intron_sbjct(preceding_intron,pacbporfD,tinypacbporf1)
                    succes = set_apps_intron_sbjct(intron,tinypacbporf1,tinypacbporf2)
                    succes = set_apps_intron_sbjct(subsequent_intron,tinypacbporf2,pacbporfA)

                # set GFF fsource attribute for recognition of intron sources
                preceding_intron._gff['fsource'] = "ABGPprojectingTE"
                intron._gff['fsource'] = "ABGPprojectingTE"
                subsequent_intron._gff['fsource'] = "ABGPprojectingTE"

                # create _linked_to_xxx attributes
                preceding_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                subsequent_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ]
                preceding_intron._linked_to_introns = [ intron,subsequent_intron ]
                intron._linked_to_introns = [ preceding_intron,subsequent_intron ]
                subsequent_intron._linked_to_introns = [ intron,preceding_intron ]

                ################################################################
                # append to results
                ################################################################
                results.append( (
                    preceding_intron,
                    intron,
                    subsequent_intron,
                    tinypacbporf1,
                    tinypacbporf2,
                    ) )

    # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row)
    return results
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """
    Merge 2 PacbPORF objects by a single tinyexon flanked by two introns

    For each donor/acceptor site pair the gap on the other (projected)
    sequence is turned into a ScanForMatches pattern, which is written to a
    file in the current working directory and searched for in the genomic
    sequence of the donor Orf through a shell pipeline. Matches that lie on
    an elegiable Orf in the correct phase and that have splice sites of
    sufficient score are collected; the first collected candidate is
    converted into a tinyexon PacbPORF and two introns.

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)
    @attention: **kwargs are first passed through _update_kwargs() with
                KWARGS_PROJECTED_TINYEXON

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @raise InproperlyAppliedArgument: queryorsbjct is not 'query' or 'sbjct'

    @rtype:  list
    @return: [ ( intron1, intron2, tinyexonPacbPORF ) ] for the first
             candidate found, or an empty list when there is none
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    # NOTE(review): hard-coded limits on the projected distance (nt);
    # kwargs is not consulted for these here
    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    # collected candidates: ( query_data, sbjct_data, splicesite_data )
    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            # DNA coordinates of both splice sites on the projected
            # (other) sequence and the nt distance in between
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            # NOTE: parsed as 3-(dPhase % 3), so for dPhase == 0 the first
            # unmasked position is 3, not 0 -- TODO confirm this is intended
            firstphasepositions = range( 3-dPhase % 3, len(query), 3)
            for pos in range(0,len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            # (Python 2 `/`: floor division on these ints)
            mismatches = max([ 0, (len(query) - query.count("N"))/2 ])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            # AUSO and DDSO are defined outside this function; presumably
            # the sizes of the flanks up/downstream of the AG / GT sites
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)

            ####################################################
            if verbose:
                print (pacbporfD.orfQ.id,pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            # NOTE(review): relative filename, so the file is created in the
            # current working directory; it is not removed when an exception
            # occurs before the osRemove() call below
            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                donorOrf.id,
                accepOrf.id,
                posDsbjct,
                posAsbjct,
                )
            fh = open(fname,'w')
            fh.write(sfmpat+"\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            # shell pipeline: feed the genomic sequence of donorOrf as fasta
            # into the ScanForMatches executable and reformat its output
            # with tr/sed/awk; the awk condition only passes hits whose
            # coordinates ($2,$3) lie between both thresholds given below
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (
                        donorOrf.inputgenomicsequence,
                        EXECUTABLE_SFM,fname,
                        dObj.pos+(kwargs['min_intron_nt_length']-3),
                        aObj.pos-(kwargs['min_intron_nt_length']-3) )
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr,seqmatch in matches.iteritems():
                # header has the shape '<name>:[<start>,<stop>]'
                startQ,stopQ = [ int(item) for item in hdr.split(":")[1][1:-1].split(",") ]
                # strip the pattern's flanks and the AG / GT dinucleotides
                # to obtain the coordinates of the tinyexon itself
                exonQstart = startQ + AUSO + 2 - 1
                exonQstop = stopQ - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_elegiable_orfs(
                max_orf_start=exonQstart,min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! In case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # then the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score
                dScore = _score_splice_site(seqmatch[-9:],splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="donor",
                        min_pssm_score=kwargs['min_donor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_donor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="acceptor",
                        min_pssm_score=kwargs['min_acceptor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_acceptor_pssm_score'])

                # get the acceptor object of the 1th intron
                # (acceptor site at the start of the tinyexon)
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found be SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get the donor object of the 2th intron
                # (donor site at the end of the tinyexon)
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found be SFM regex
                    # is not a valid donor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # check if introns are of elegiable lengths
                if (intron1_aObj.pos-dObj.pos) > kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos-intron2_dObj.pos) > kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                # NOTE(review): `True or` makes this print unconditional,
                # regardless of `verbose` -- looks like a debugging leftover
                if True or verbose:
                    # if here, a candidate!!!
                    print (pacbporfD.orfQ.id,tinyexonorf.id,pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data = ( tinyexonorf, exonQstart, exonQstop )
                sbjct_data = ( prjctOrf, posDsbjct, posAsbjct )
                splicesite_data = ( dObj,intron1_aObj, intron2_dObj, aObj )
                tinyexons.append( ( query_data, sbjct_data, splicesite_data ) )

            # file cleanup
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    # NOTE(review): no ranking is done; the first candidate found is taken
    (query_data,sbjct_data,splicesite_data) = tinyexons[0]
    orfQ,query_dna_start,query_dna_end = query_data
    orfS,sbjct_dna_start,sbjct_dna_end = sbjct_data
    (intron1_dObj,intron1_aObj,intron2_dObj,intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "tinyexon orf:", intron1_aObj
        print "tinyexon orf:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    # extend the tinyexon by one AA position on both sides
    startQaa = orfQ.dnapos2aapos(query_dna_start) -1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) -1
    stopQaa = orfQ.dnapos2aapos(query_dna_end) +1
    stopSaa = orfS.dnapos2aapos(sbjct_dna_end) +1
    # check for directly leading stop codon on tinyexon
    # (query and sbjct coordinates are always shifted together so both
    #  sequences stay equally long)
    while startQaa <= orfQ.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    while startSaa <= orfS.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    while stopSaa > orfS.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa,abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa,abs_pos_end=stopSaa)

    ####################################################
    # NOTE: printed on unequal sequence lengths too, not only if verbose
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa-startQaa),
        print (query_dna_start,query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa-startSaa),
        print (sbjct_dna_start,sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start-2:query_dna_end+2]
        print orfS.inputgenomicsequence[sbjct_dna_start-2:sbjct_dna_end+2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=( qAAseq, sAAseq, startQaa, startSaa ) )
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp,orfQ,orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(
                intron1_dObj, intron1_aObj, None,
                donorOrf,pacbporf.orfQ )
    intron2 = IntronConnectingOrfs(
                intron2_dObj, intron2_aObj, None,
                pacbporf.orfQ, accepOrf )

    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    # (the returned status `succes` is not inspected)
    if queryorsbjct == "query":
        succes = set_apps_intron_query(intron1,pacbporfD,pacbporf)
        succes = set_apps_intron_query(intron2,pacbporf,pacbporfA)
    else:
        succes = set_apps_intron_sbjct(intron1,pacbporfD,pacbporf)
        succes = set_apps_intron_sbjct(intron2,pacbporf,pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [ pacbporf ]
    intron2._linked_to_pacbporfs = [ pacbporf ]
    intron1._linked_to_introns = [ intron2 ]
    intron2._linked_to_introns = [ intron1 ]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
    if False:
        # printing data when this function needs to be debugged:
        # (dead code as long as the condition is the literal False;
        #  when enabled it terminates the program through sys.exit())
        print ""
        print intron1
        print intron2
        print ""
        print pacbporfD
        pacbporfD.print_protein_and_dna()
        print ""
        print pacbporf
        pacbporf.print_protein_and_dna()
        print ""
        print pacbporfA
        pacbporfA.print_protein_and_dna()
        import sys
        sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1,intron2,pacbporf)]
def _find_qq_tinyexons_as_pacbporfs(target, tinyexondata, PCG, min_discovery_count=2): """ """ target_tinyexon_pacbporf_data = {} for informant in tinyexondata.keys(): if informant == target: continue thepacbporfs = order_pacbporf_list( PCG.get_pacbps_by_organisms(target, informant)) for exonQ in tinyexondata[target]: if exonQ.orf.id in [pf.orfQ.id for pf in thepacbporfs]: continue for (prevpos, nextpos) in [(pos - 1, pos) for pos in range(1, len(thepacbporfs))]: prevPF = thepacbporfs[prevpos] nextPF = thepacbporfs[nextpos] if prevPF.orfS.id == nextPF.orfS.id: # check if PacbPORFs are positioned more or less okay if prevPF.distance_towards(nextPF) > 20: continue # check if exonQ is positioned ~between these PacbPORFs if exonQ.orf.dnapos2aapos(exonQ.end) < max( prevPF.alignment_protein_range_query()) - 12: continue if exonQ.orf.dnapos2aapos(exonQ.start) > min( nextPF.alignment_protein_range_query()) + 12: continue # check if gap can be projected already by a perfect intron introns = merge_pacbporfs_by_intron_in_query( prevPF, nextPF, max_aa_offset=1) # if introns found => continue if introns: continue # orfObj is the orfS of prevPF or nextPF (just take any) orfObj = prevPF.orfS # assign elegiable range of tinyexon match on SBJCT aapos_sbjct_range = range( max(prevPF.alignment_protein_range_sbjct()) - 12, min(nextPF.alignment_protein_range_sbjct()) + 12) tinyexonmatches = _find_match_on_orfobj(exonQ, orfObj) for (aaseq, aapos) in tinyexonmatches: # check if the match is obtained in the expected # sbjct AA range; if not, ignore the match if aapos not in aapos_sbjct_range: continue # make pacbporf object pacbpobj = PacbP( input=(exonQ.proteinsequence(), aaseq, exonQ.orf.dnapos2aapos(exonQ.start), aapos)) pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, orfObj) pacbporfobj.extend_pacbporf_after_stops() # remove included pacbporfs is_suborsuperset = False for accepted_pacbporf in thepacbporfs: if pacbporfobj.issubsetorsuperset( accepted_pacbporf): is_suborsuperset = 
True break if is_suborsuperset: continue # check if 2 (perfect) introns can be projected introns5p = merge_pacbporfs_by_intron_in_query( prevPF, pacbporfobj, max_aa_offset=1, max_intron_nt_length=None) #max_intron_nt_length=140) introns3p = merge_pacbporfs_by_intron_in_query( pacbporfobj, nextPF, max_aa_offset=1, max_intron_nt_length=None) #max_intron_nt_length=140) # continue if not is_confirmed_by_intron_projection if not introns5p or not introns3p: continue # check if placeable in PCG/pacbporflist distPrev = prevPF.distance_towards(pacbporfobj) distNext = pacbporfobj.distance_towards(nextPF) ovrlPrev = pacbporfobj.overlap(prevPF) ovrlNext = pacbporfobj.overlap(nextPF) if distPrev and distNext: rejected = False elif not distPrev and ovrlPrev: rejected = False elif not distNext and ovrlNext: rejected = False elif ovrlPrev and ovrlNext: rejected = False else: rejected = True print "OKAY", exonQ.proteinsequence( ), aaseq, rejected, informant, (distPrev, distNext, ovrlPrev, ovrlNext) # label pacbporf as found by tinyexon QQ pacbporfobj._tinyexon_label = "QQ" # store to target_tinyexon_pacbporf_data key = (exonQ.proteinsequence(), exonQ.start) _update_tinyexon_pacbporf_dict( target_tinyexon_pacbporf_data, key, pacbporfobj, rejected, informant) # cleanup tinyexon protein matches that have been observed to litte _remove_dict_elements_with_short_value_list( target_tinyexon_pacbporf_data, min_value_list_size=min_discovery_count) # return target_tinyexon_pacbporf_data return target_tinyexon_pacbporf_data
def _find_qp_and_pq_tinyexons_as_pacbporfs(target, tinyexondata, PCG,
                                           min_discovery_count=2):
    """
    Match tinyexons of the target on informant Orfs ('QP' label)

    For each informant (every key of tinyexondata except target) and each
    tinyexon of the target whose Orf is not yet represented as orfQ in the
    target-informant PacbPORFs, matches are searched for on all Orfs of the
    informant. A match is kept when it is no sub/superset of an existing
    PacbPORF and an intron can be projected between it and an existing
    PacbPORF on the same sbjct Orf.

    @type  target: *
    @param target: key in tinyexondata of the target organism

    @type  tinyexondata: dict
    @param tinyexondata: organism keys, lists of tinyexon objects as values

    @type  PCG: object
    @param PCG: object providing get_pacbps_by_organisms() and
                get_orfs_of_graph()

    @type  min_discovery_count: integer
    @param min_discovery_count: passed as min_value_list_size to
                    _remove_dict_elements_with_short_value_list()

    @rtype:  dict
    @return: dict with keys ( tinyexon protein sequence, tinyexon start ),
             filled by _update_tinyexon_pacbporf_dict()
    """
    found = {}
    for informant in tinyexondata.keys():
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))

        for exonQ in tinyexondata[target]:
            # skip tinyexons on an Orf that is already in a PacbPORF
            if exonQ.orf.id in [pf.orfQ.id for pf in thepacbporfs]:
                continue

            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                for (aaseq, aapos) in _find_qp_or_pq_match_on_orfobj(
                        exonQ, orfObj):
                    # make pacbporf object
                    pacbpobj = PacbP(input=(
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos))
                    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # remove included pacbporfs
                    if any(pacbporfobj.issubsetorsuperset(accepted_pacbporf)
                           for accepted_pacbporf in thepacbporfs):
                        continue

                    # check if a (perfect) intron can be projected towards
                    # any accepted PacbPORF on the same sbjct Orf
                    is_confirmed_by_intron_projection = False
                    for accepted_pacbporf in thepacbporfs:
                        if accepted_pacbporf.orfS.id != pacbporfobj.orfS.id:
                            continue
                        # the PacbPORF that starts first on the query
                        # delivers the donor
                        if min(accepted_pacbporf.alignment_dna_range_query()) >\
                        min(pacbporfobj.alignment_dna_range_query()):
                            donorPF, accepPF = pacbporfobj, accepted_pacbporf
                        else:
                            donorPF, accepPF = accepted_pacbporf, pacbporfobj
                        try:
                            introns = merge_pacbporfs_by_intron_in_query(
                                donorPF, accepPF,
                                max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event: TODO: solve in
                            # merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) >= 1:
                            is_confirmed_by_intron_projection = True
                            break

                    if not is_confirmed_by_intron_projection:
                        continue

                    # check if placeable in PCG/pacbporflist; rejected as
                    # soon as a single PacbPORF is incompatibly positioned
                    rejected = False in [
                        pf.is_postioned_compatibly(pacbporfobj)
                        for pf in thepacbporfs]

                    # label pacbporf as found by tinyexon QP
                    pacbporfobj._tinyexon_label = "QP"

                    # store to the collection of found tinyexon PacbPORFs
                    key = (exonQ.proteinsequence(), exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        found, key, pacbporfobj, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        found, min_value_list_size=min_discovery_count)

    return found
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,pacbporfA, verbose=False,**kwargs): """ Merge 2 PacbPORF objects by closeby independant gained introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intronQ, intronS, CIGexonPacbPORF ) """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes kwargs['allow_phase_shift'] = True _update_kwargs(kwargs,KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['cig_max_aa_length'] # run regular merge_pacbporfs_with_introns function alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=verbose,**kwargs) cig_introns = [] if verbose: print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs['cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance'] # check if there is length congruence between the cig_introns for intQ,intS in alg_introns: dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True) distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase) distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase) ######################################################################## if verbose: print (intQ.donor.pos, intQ.acceptor.pos), print (intS.donor.pos, intS.acceptor.pos), print distDnt, distAnt, kwargs['max_nt_offset'] 
######################################################################## if abs(distDnt-distAnt) > kwargs['max_nt_offset']: # intermediate ciigPacbPORF has query vs sbjct length discrepancy # *3 for AA2nt coordinate conversion, +2 to allow different phases # e.g. phase difference can give 1AA+2nt difference continue if intQ.donor.phase == intS.donor.phase and\ (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if intQ.acceptor.phase == intS.acceptor.phase and\ (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if abs(distDnt) <= 5 or abs(distDnt) <= 5: # most likely a splice site phase shift, not a c.i.g. continue if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\ abs(distAnt/3) <= kwargs['cig_max_aa_length']: # putatively a closeby independant (intron) gain cig_introns.append( ( intQ, intS ) ) ############################################################################ if verbose: for intQ,intS in cig_introns: print "cig?:", (intQ.donor.pos, intQ.acceptor.pos), print (intS.donor.pos, intS.acceptor.pos) ############################################################################ # return variable to store found positive cases of CIG into found_cig_list = [] # check if there is some sequence similarity for intQ,intS in cig_introns: # get alignment positions around query & sbjcts splice sites dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True) distD = dQpos - dSpos distA = aQpos - aSpos distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase) distAnt = (aQpos*3 + aQphase) - (aSpos*3 + 
aSphase) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side #mode = "SQ" #qStart = pacbporfD._positions[dSpos].query_pos #qEnd = qStart + distD #sStart = pacbporfA._positions[aSpos].sbjct_pos #sEnd = sStart + distD #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) mode = "SQ" qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos) qStart= qEnd - max([distA,distD]) sStart= pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos) sEnd = sStart + max([distA,distD]) qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) else: # distDnt and distAnt are < 0 ## SBJCT is extended on the donor site #mode = "QS" #qStart = pacbporfA._positions[aQpos].query_pos #qEnd = qStart - distA #sStart = pacbporfD._positions[dQpos].sbjct_pos #sEnd = sStart - distA #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) mode = "QS" qStart= pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos) qEnd = qStart - min([distA,distD]) sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos) sStart= sEnd + min([distA,distD]) qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) headerQ = "query_%s_%s_%s" % (qStart,qEnd,qSeq) headerS = "sbjct_%s_%s_%s" % (sStart,sEnd,sSeq) headerQ = headerQ[0:20] # truncate to prevent error headerS = headerS[0:20] # truncate to prevent error if verbose: print mode, (distD,distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt, print dQpos, aQpos, dSpos, aSpos if not qSeq: continue # superfluous check-doublecheck for sequence if not sSeq: continue # superfluous check-doublecheck for sequence #################################################### # make PacbPORF with ClustalW #################################################### # 
align the sequences with clustalw seqs = { headerQ: qSeq, headerS: sSeq } (alignedseqs,alignment) = clustalw(seqs=seqs) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw( alignment=( alignedseqs[headerQ], alignment, alignedseqs[headerS] ), coords=(qStart,qEnd,sStart,sEnd) ) if not pacbp: continue # strip unaligned fraction of this pacbp object, then check length pacbp.strip_unmatched_ends() if len(pacbp) < kwargs['cig_min_aa_length']: continue if len(pacbp) > kwargs['cig_max_aa_length']: continue if pacbp: # initialize extended tiny PacbPORF caused by c.i.g. if distDnt > 0: cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfD.orfQ,pacbporfA.orfS) else: cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfA.orfQ,pacbporfD.orfS) cig_pacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print pacbp, len(pacbp) print cig_pacbporf print "CIG:", intQ print "CIG:", intS print distD, distA, distDnt, distAnt cig_pacbporf.print_protein_and_dna() #################################################################### #################################################################### # set some meta-data properties to the intron objects #################################################################### # add distance score to introns # The distance set in merge_pacbporfs_with_introns is large; # it is the actual distance between the splice sites. 
In CIG, # the measure for distance is the length difference between # the offset between query and sbjct measured on the cig_pacbporf intQ._distance = abs(distDnt-distAnt) intS._distance = abs(distDnt-distAnt) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,cig_pacbporf,pacbporfA) succes = set_apps_intron_sbjct(intS,pacbporfD,cig_pacbporf) else: # SBJCT is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ,pacbporfD,cig_pacbporf) succes = set_apps_intron_sbjct(intS,cig_pacbporf,pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPcig" intS._gff['fsource'] = "ABGPcig" # create _linked_to_xxx attributes intQ._linked_to_pacbporfs = [ cig_pacbporf ] intS._linked_to_pacbporfs = [ cig_pacbporf ] # append to found_cig_list found_cig_list.append( ( intQ, intS, cig_pacbporf ) ) else: # no alignment possible -> try next continue # return lists of closeby_independant_introns return found_cig_list
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by closeby independant gained introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list @return: list with ( intronQ, intronS, CIGexonPacbPORF ) """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes kwargs['allow_phase_shift'] = True _update_kwargs(kwargs, KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs[ 'cig_max_aa_length'] # run regular merge_pacbporfs_with_introns function alg_introns = merge_pacbporfs_with_introns(pacbporfD, pacbporfA, verbose=verbose, **kwargs) cig_introns = [] if verbose: print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs[ 'cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance'] # check if there is length congruence between the cig_introns for intQ, intS in alg_introns: dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos, forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos, forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos, forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos, forced_return=True) distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase) distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase) ######################################################################## if verbose: print(intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos), print distDnt, distAnt, 
kwargs['max_nt_offset'] ######################################################################## if abs(distDnt - distAnt) > kwargs['max_nt_offset']: # intermediate ciigPacbPORF has query vs sbjct length discrepancy # *3 for AA2nt coordinate conversion, +2 to allow different phases # e.g. phase difference can give 1AA+2nt difference continue if intQ.donor.phase == intS.donor.phase and\ (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if intQ.acceptor.phase == intS.acceptor.phase and\ (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']: # a regularly merged intron combination continue if abs(distDnt) <= 5 or abs(distDnt) <= 5: # most likely a splice site phase shift, not a c.i.g. continue if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\ abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\ abs(distAnt/3) <= kwargs['cig_max_aa_length']: # putatively a closeby independant (intron) gain cig_introns.append((intQ, intS)) ############################################################################ if verbose: for intQ, intS in cig_introns: print "cig?:", (intQ.donor.pos, intQ.acceptor.pos), print(intS.donor.pos, intS.acceptor.pos) ############################################################################ # return variable to store found positive cases of CIG into found_cig_list = [] # check if there is some sequence similarity for intQ, intS in cig_introns: # get alignment positions around query & sbjcts splice sites dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos, forced_return=True) dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos, forced_return=True) aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos, forced_return=True) aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos, forced_return=True) distD = dQpos - dSpos distA = aQpos - aSpos distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase) distAnt = 
(aQpos * 3 + aQphase) - (aSpos * 3 + aSphase) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side #mode = "SQ" #qStart = pacbporfD._positions[dSpos].query_pos #qEnd = qStart + distD #sStart = pacbporfA._positions[aSpos].sbjct_pos #sEnd = sStart + distD #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd) #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd) mode = "SQ" qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos) qStart = qEnd - max([distA, distD]) sStart = pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos) sEnd = sStart + max([distA, distD]) qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) else: # distDnt and distAnt are < 0 ## SBJCT is extended on the donor site #mode = "QS" #qStart = pacbporfA._positions[aQpos].query_pos #qEnd = qStart - distA #sStart = pacbporfD._positions[dQpos].sbjct_pos #sEnd = sStart - distA #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) mode = "QS" qStart = pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos) qEnd = qStart - min([distA, distD]) sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos) sStart = sEnd + min([distA, distD]) qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd) sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd) headerQ = "query_%s_%s_%s" % (qStart, qEnd, qSeq) headerS = "sbjct_%s_%s_%s" % (sStart, sEnd, sSeq) headerQ = headerQ[0:20] # truncate to prevent error headerS = headerS[0:20] # truncate to prevent error if verbose: print mode, ( distD, distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt, print dQpos, aQpos, dSpos, aSpos if not qSeq: continue # superfluous check-doublecheck for sequence if not sSeq: continue # superfluous check-doublecheck for sequence #################################################### # make PacbPORF with ClustalW 
#################################################### # align the sequences with clustalw seqs = {headerQ: qSeq, headerS: sSeq} (alignedseqs, alignment) = clustalw(seqs=seqs) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw(alignment=(alignedseqs[headerQ], alignment, alignedseqs[headerS]), coords=(qStart, qEnd, sStart, sEnd)) if not pacbp: continue # strip unaligned fraction of this pacbp object, then check length pacbp.strip_unmatched_ends() if len(pacbp) < kwargs['cig_min_aa_length']: continue if len(pacbp) > kwargs['cig_max_aa_length']: continue if pacbp: # initialize extended tiny PacbPORF caused by c.i.g. if distDnt > 0: cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfD.orfQ, pacbporfA.orfS) else: cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfA.orfQ, pacbporfD.orfS) cig_pacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print pacbp, len(pacbp) print cig_pacbporf print "CIG:", intQ print "CIG:", intS print distD, distA, distDnt, distAnt cig_pacbporf.print_protein_and_dna() #################################################################### #################################################################### # set some meta-data properties to the intron objects #################################################################### # add distance score to introns # The distance set in merge_pacbporfs_with_introns is large; # it is the actual distance between the splice sites. 
In CIG, # the measure for distance is the length difference between # the offset between query and sbjct measured on the cig_pacbporf intQ._distance = abs(distDnt - distAnt) intS._distance = abs(distDnt - distAnt) if distDnt > 0: # then, distAnt is as well > 0 # QUERY is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, cig_pacbporf, pacbporfA) succes = set_apps_intron_sbjct(intS, pacbporfD, cig_pacbporf) else: # SBJCT is extended on the donor side # add Alignment Positional Periphery Score into objects succes = set_apps_intron_query(intQ, pacbporfD, cig_pacbporf) succes = set_apps_intron_sbjct(intS, cig_pacbporf, pacbporfA) # set GFF fsource attribute for recognition of intron sources intQ._gff['fsource'] = "ABGPcig" intS._gff['fsource'] = "ABGPcig" # create _linked_to_xxx attributes intQ._linked_to_pacbporfs = [cig_pacbporf] intS._linked_to_pacbporfs = [cig_pacbporf] # append to found_cig_list found_cig_list.append((intQ, intS, cig_pacbporf)) else: # no alignment possible -> try next continue # return lists of closeby_independant_introns return found_cig_list
def _merge_pacbporfs_by_two_tinyexons(pacbporfD, pacbporfA, orfSetObject, queryorsbjct, verbose=False, **kwargs): """ """ # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON) tinyexons = [] sposD = pacbporfD._get_original_alignment_pos_start() eposD = pacbporfD._get_original_alignment_pos_end() sposA = pacbporfA._get_original_alignment_pos_start() eposA = pacbporfA._get_original_alignment_pos_end() if queryorsbjct == "query": donorOrf = pacbporfD.orfQ accepOrf = pacbporfA.orfQ prjctOrf = pacbporfD.orfS dStart, dEnd = sposD.query_dna_start, eposD.query_dna_end aStart, aEnd = sposA.query_dna_start, eposA.query_dna_end elif queryorsbjct == "sbjct": donorOrf = pacbporfD.orfS accepOrf = pacbporfA.orfS prjctOrf = pacbporfD.orfQ dStart, dEnd = sposD.sbjct_dna_start, eposD.sbjct_dna_end aStart, aEnd = sposA.sbjct_dna_start, eposA.sbjct_dna_end else: message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct raise InproperlyAppliedArgument, message # get all potential combinations of two tinyexons tinyexoncombis = merge_orfs_with_two_tinyexons( donorOrf, accepOrf, donorOrf._donor_sites, accepOrf._acceptor_sites, orfSetObject.orfs, ) results = [] for dObj in donorOrf._donor_sites: if queryorsbjct == "query": (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos, forced_return=True) else: (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos, forced_return=True) try: algDobj = pacbporfD._positions[dPos] except IndexError: # site out of range of PacbPORF -> break break # check if dObj is on pfD; # introns of tinyexons can be projected outside of pfD/pfA area if dObj.pos < dStart: continue for aObj in accepOrf._acceptor_sites: if queryorsbjct == "query": (aPos, aPhase) = pacbporfA.dnaposition_query(aObj.pos, forced_return=True) else: (aPos, aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos, forced_return=True) try: algAobj = pacbporfA._positions[aPos] except IndexError: # site out of range of PacbPORF -> break break # 
check if aObj is on pfA; # introns of tinyexons can be projected outside of pfD/pfA area if aObj.pos > aEnd: continue if queryorsbjct == "query": posDsbjct = algDobj.sbjct_dna_start + dPhase posAsbjct = algAobj.sbjct_dna_start + aPhase else: posDsbjct = algDobj.query_dna_start + dPhase posAsbjct = algAobj.query_dna_start + aPhase distance = posAsbjct - posDsbjct if distance >= (kwargs['max_tinyexon_nt_length'] * 2): break if distance < (kwargs['min_tinyexon_nt_length'] * 2): continue filtered_tinyexoncombis = _filter_tinyexoncombis( tinyexoncombis, min_length=distance, max_length=distance, min_first_acceptor_pos=dObj.pos + kwargs['min_tinyexon_intron_nt_length'], max_final_donor_pos=aObj.pos - kwargs['min_tinyexon_intron_nt_length'], phase_final_donor=aObj.phase, phase_first_acceptor=dObj.phase, ) if not filtered_tinyexoncombis: continue #################################################################### if verbose: print distance, dObj, aObj, len(tinyexoncombis), print len(filtered_tinyexoncombis) #################################################################### for exon1, intron, exon2 in filtered_tinyexoncombis: # make preceding intron preceding_intron = IntronConnectingOrfs( dObj, exon1.acceptor, None, donorOrf, exon1.orf) # make subsequent intron subsequent_intron = IntronConnectingOrfs( exon2.donor, aObj, None, exon2.orf, accepOrf) ################################################################ if verbose: print "\t", exon1, exon1.proteinsequence(), print preceding_intron.phase, exon1.donor.phase, print subsequent_intron.phase, preceding_intron.shared_aa, print intron.shared_aa, subsequent_intron.shared_aa print "\t", exon2, exon2.proteinsequence() ################################################################ # get prjctOrf sequence for comparison correctionA = 0 if aObj.phase != 0: # INCLUDE the final AA which is broken by the splicesite correctionA = 1 if queryorsbjct == "query": startPos, _phase = pacbporfD.dnaposition_query( dObj.pos, 
forced_return=True) stopPos, _phase = pacbporfA.dnaposition_query( aObj.pos, forced_return=True) start = pacbporfD._positions[startPos].sbjct_pos stop = pacbporfA._positions[stopPos].sbjct_pos + correctionA else: startPos, _phase = pacbporfD.dnaposition_sbjct( dObj.pos, forced_return=True) stopPos, _phase = pacbporfA.dnaposition_sbjct( aObj.pos, forced_return=True) start = pacbporfD._positions[startPos].query_pos stop = pacbporfA._positions[stopPos].query_pos + correctionA if stop <= start: # tinyexon is so tiny that is does not have a single # full aligned AA -> discard here continue # actually get the prjctOrf sequence aaseq = prjctOrf.getaas(abs_pos_start=start, abs_pos_end=stop) # initialize a PacbP for the combination of both tinyexons # afterwards, check if the indentityscore is > 0.XX from pacb import PacbP seqparts = [ preceding_intron.shared_aa, exon1.proteinsequence(), intron.shared_aa, exon2.proteinsequence(), subsequent_intron.shared_aa ] ################################################################ if verbose or len("".join(seqparts)) != len(aaseq): print pacbporfD print exon1.orf, exon2.orf, prjctOrf print pacbporfA print seqparts print aaseq, len(aaseq), len("".join(seqparts)), (start, stop) print "'%s'" % queryorsbjct, print "Q", (algDobj.query_pos, algAobj.query_pos), print "S", (algDobj.sbjct_pos, algAobj.sbjct_pos) print "distance:", distance, kwargs[ 'max_tinyexon_nt_length'], print(posDsbjct, posAsbjct), print "Q-dna:", (algDobj.query_dna_start, dPhase, algAobj.query_dna_start, aPhase), print "S-dna:", (algDobj.sbjct_dna_start, dPhase, algAobj.sbjct_dna_start, aPhase) ################################################################ # ignore by continue when sequences not identical in length if len("".join(seqparts)) != len(aaseq): continue testpacbp = PacbP(input=("".join(seqparts), aaseq, 0, 0)) testpacbp.strip_unmatched_ends() if not ( testpacbp.identityscore > 0.60 and\ (float(testpacbp.length) / len(aaseq)) > 0.70 ): # not a very 
convincing alignment continue ################################################################ if verbose: print testpacbp testpacbp.print_protein() ################################################################ # if here, succesfully mapped 2 tiny exons!! # get all sequences/coordinates in place for # pacbporf formation orfQ1 = exon1.orf orfS1 = prjctOrf orfQ2 = exon2.orf orfS2 = prjctOrf seqQ1 = exon1.proteinsequence() seqQ2 = exon2.proteinsequence() coordQ1 = exon1.acceptor.pos / 3 coordS1 = start coordQ2 = exon2.acceptor.pos / 3 coordS2 = start + len(seqparts[0]) + len(seqparts[1]) + len( seqparts[2]) seqS1 = aaseq[0:(len(seqparts[0]) + len(seqparts[1]))] seqS2 = aaseq[-(len(seqparts[3]) + len(seqparts[4])):] if len(seqparts[0]): seqS1 = seqS1[1:] coordS1 += 1 if len(seqparts[4]): seqS2 = seqS2[:-1] if queryorsbjct == "sbjct": # swap query <-> sbjct orfQ1, orfS1 = orfS1, orfQ1 orfQ2, orfS2 = orfS2, orfQ2 seqQ1, seqS1 = seqS1, seqQ1 seqQ2, seqS2 = seqS2, seqQ2 coordQ1, coordS1 = coordS1, coordQ1 coordQ2, coordS2 = coordS2, coordQ2 ################################################################ if verbose: print "tinypacbporf1:", seqQ1, seqQ2, coordQ1, coordQ2 print "tinypacbporf2:", seqS1, seqS2, coordS1, coordS2 ################################################################ # make pacbporfs pacbp1 = PacbP(input=(seqQ1, seqS1, coordQ1, coordS1)) pacbp1.strip_unmatched_ends() tinypacbporf1 = pacbp2pacbporf(pacbp1, orfQ1, orfS1) tinypacbporf1.extend_pacbporf_after_stops() pacbp2 = PacbP(input=(seqQ2, seqS2, coordQ2, coordS2)) pacbp2.strip_unmatched_ends() tinypacbporf2 = pacbp2pacbporf(pacbp2, orfQ2, orfS2) tinypacbporf2.extend_pacbporf_after_stops() ################################################################ if verbose: print tinypacbporf1 tinypacbporf1.print_protein_and_dna() print tinypacbporf2 tinypacbporf2.print_protein_and_dna() ################################################################ 
################################################################ # set some meta-data properties to the intron objects ################################################################ # add distance score to intron preceding_intron._distance = 0 intron._distance = 0 subsequent_intron._distance = 0 # add Alignment Positional Periphery Score into objects if queryorsbjct == "query": succes = set_apps_intron_query(preceding_intron, pacbporfD, tinypacbporf1) succes = set_apps_intron_query(intron, tinypacbporf1, tinypacbporf2) succes = set_apps_intron_query(subsequent_intron, tinypacbporf2, pacbporfA) else: succes = set_apps_intron_sbjct(preceding_intron, pacbporfD, tinypacbporf1) succes = set_apps_intron_sbjct(intron, tinypacbporf1, tinypacbporf2) succes = set_apps_intron_sbjct(subsequent_intron, tinypacbporf2, pacbporfA) # set GFF fsource attribute for recognition of intron sources preceding_intron._gff['fsource'] = "ABGPprojectingTE" intron._gff['fsource'] = "ABGPprojectingTE" subsequent_intron._gff['fsource'] = "ABGPprojectingTE" # create _linked_to_xxx attributes preceding_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ] intron._linked_to_pacbporfs = [tinypacbporf1, tinypacbporf2] subsequent_intron._linked_to_pacbporfs = [ tinypacbporf1, tinypacbporf2 ] preceding_intron._linked_to_introns = [ intron, subsequent_intron ] intron._linked_to_introns = [ preceding_intron, subsequent_intron ] subsequent_intron._linked_to_introns = [ intron, preceding_intron ] ################################################################ # append to results ################################################################ results.append(( preceding_intron, intron, subsequent_intron, tinypacbporf1, tinypacbporf2, )) # return 3 introns and 2 intermediate tinyexon PacbPORFs (per row) return results
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD, pacbporfA, orfSetObject, queryorsbjct, verbose=False, **kwargs): """ Merge 2 PacbPORF objects by introns @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs) @type pacbporfD: PacbPORF object @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects @type pacbporfA: PacbPORF object @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects @type orfSetObject: object with elegiable Orfs @param orfSetObject: object with elegiable Orfs @type queryorsbjct: string @param queryorsbjct: literal string 'query' or 'sbjct' @type verbose: Boolean @param verbose: print debugging info to STDOUT when True @rtype: list @return: list with ( intron, ExonOnOrf, intron ) on the query sequence """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON) MAX_TINYEXON_NT_LENGTH = 33 MIN_TINYEXON_NT_LENGTH = 6 tinyexons = [] if queryorsbjct == "query": donorOrf = pacbporfD.orfQ accepOrf = pacbporfA.orfQ prjctOrf = pacbporfD.orfS alignedDonorRange = pacbporfD.alignment_dna_range_query() alignedAccepRange = pacbporfA.alignment_dna_range_query() elif queryorsbjct == "sbjct": donorOrf = pacbporfD.orfS accepOrf = pacbporfA.orfS prjctOrf = pacbporfD.orfQ alignedDonorRange = pacbporfD.alignment_dna_range_sbjct() alignedAccepRange = pacbporfA.alignment_dna_range_sbjct() else: message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct raise InproperlyAppliedArgument, message for dObj in donorOrf._donor_sites: # do not make a projection OVER the aligned area if dObj.pos < min(alignedDonorRange): continue if queryorsbjct == "query": (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos, forced_return=True) else: (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos, forced_return=True) try: algDobj = pacbporfD._positions[dPos] except IndexError: # site out of range of 
PacbPORF -> break break for aObj in accepOrf._acceptor_sites: # do not make a projection OVER the aligned area if aObj.pos > max(alignedAccepRange): continue if queryorsbjct == "query": (aPos, aPhase) = pacbporfA.dnaposition_query(aObj.pos, forced_return=True) else: (aPos, aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos, forced_return=True) try: algAobj = pacbporfA._positions[aPos] except IndexError: # site out of range of PacbPORF -> break break if queryorsbjct == "query": posDsbjct = algDobj.sbjct_dna_start + dPhase posAsbjct = algAobj.sbjct_dna_start + aPhase else: posDsbjct = algDobj.query_dna_start + dPhase posAsbjct = algAobj.query_dna_start + aPhase distance = posAsbjct - posDsbjct if distance >= MAX_TINYEXON_NT_LENGTH: break if distance < MIN_TINYEXON_NT_LENGTH: continue #################################################### # generate a ScanForMatches pattern file #################################################### # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3 query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct]) # mask all non-phase0 nucleotides to N residues; # this represents the regularexpression for a specific # peptide sequence firstphasepositions = range(3 - dPhase % 3, len(query), 3) for pos in range(0, len(query)): if pos not in firstphasepositions: query[pos] = "N" # calculate a ~50% mismatch number mismatches = max([0, (len(query) - query.count("N")) / 2]) # write the pattern to string and subsequently to file # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3 if kwargs['allow_non_canonical_donor']: sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % ( AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO) else: sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % ( AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO) #################################################### if verbose: print(pacbporfD.orfQ.id, pacbporfA.orfQ.id), print distance, dObj, aObj print sfmpat #################################################### fname = 
"sfmpat_tinyexon_%s_%s_%s_%s" % ( donorOrf.id, accepOrf.id, posDsbjct, posAsbjct, ) fh = open(fname, 'w') fh.write(sfmpat + "\n") fh.close() #################################################### # run ScanForMatches #################################################### command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\ """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\ """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\ """{ print $1"["$2","$3"]\\n"$4 } }' """ command = command % (donorOrf.inputgenomicsequence, EXECUTABLE_SFM, fname, dObj.pos + (kwargs['min_intron_nt_length'] - 3), aObj.pos - (kwargs['min_intron_nt_length'] - 3)) co = osPopen(command) matches = parseFasta(co.readlines()) co.close() # filter matches for: # (1) correct donor & acceptor phase # (2) high enough donor & acceptor site scores for hdr, seqmatch in matches.iteritems(): startQ, stopQ = [ int(item) for item in hdr.split(":")[1][1:-1].split(",") ] exonQstart = startQ + AUSO + 2 - 1 exonQstop = stopQ - DDSO - 2 #################################### # get Orf object of tinyexon #################################### tinyexonorf = None # select the Orf on which the tinyexon is located for orfObj in orfSetObject.get_eligible_orfs( max_orf_start=exonQstart, min_orf_end=exonQstop): orfPhase = (exonQstart - orfObj.startPY) % 3 if orfPhase == dPhase: tinyexonorf = orfObj break else: # No tinyexonorf assigned!! 
Iin case a regex matched # over a STOP-codon or the regex length is smaller # then the smallest Orf, no Orf can be assigned continue # filter for donor & acceptor score dScore = _score_splice_site(seqmatch[-9:], splicetype='donor') aScore = _score_splice_site(seqmatch[0:11], splicetype='acceptor') if dScore < kwargs['min_donor_pssm_score']: continue if aScore < kwargs['min_acceptor_pssm_score']: continue # scan Orf for splicesites tinyexonorf.scan_orf_for_pssm_splice_sites( splicetype="donor", min_pssm_score=kwargs['min_donor_pssm_score'], allow_non_canonical=kwargs['allow_non_canonical_donor'], non_canonical_min_pssm_score=kwargs[ 'non_canonical_min_donor_pssm_score']) tinyexonorf.scan_orf_for_pssm_splice_sites( splicetype="acceptor", min_pssm_score=kwargs['min_acceptor_pssm_score'], allow_non_canonical=kwargs['allow_non_canonical_acceptor'], non_canonical_min_pssm_score=kwargs[ 'non_canonical_min_acceptor_pssm_score']) # get 1th intron donor object intron1_aObj = None for a in tinyexonorf._acceptor_sites: if a.pos == exonQstart: intron1_aObj = a break else: # pseudo-acceptorsite as found be SFM regex # is not a valid acceptor site of high enough score # continue to next iteration of (hdr,seqmatch) pair continue # get 2th intron donor object intron2_dObj = None for d in tinyexonorf._donor_sites: if d.pos == exonQstop: intron2_dObj = d break else: # pseudo-donorsite as found be SFM regex # is not a valid acceptor site of high enough score # continue to next iteration of (hdr,seqmatch) pair continue # check if introns are of elegiable lengths if (intron1_aObj.pos - dObj.pos) > kwargs['max_intron_nt_length']: continue if (aObj.pos - intron2_dObj.pos) > kwargs['max_intron_nt_length']: continue #################################################### if True or verbose: # if here, a candidate!!! 
print(pacbporfD.orfQ.id, tinyexonorf.id, pacbporfA.orfQ.id), print hdr, dScore, aScore print seqmatch #################################################### # append to found tinyexons query_data = (tinyexonorf, exonQstart, exonQstop) sbjct_data = (prjctOrf, posDsbjct, posAsbjct) splicesite_data = (dObj, intron1_aObj, intron2_dObj, aObj) tinyexons.append((query_data, sbjct_data, splicesite_data)) # file cleanup osRemove(fname) # return - End Of Function - if no tinyexons are found if not tinyexons: return [] #################################### # select the **best** tinyexon #################################### (query_data, sbjct_data, splicesite_data) = tinyexons[0] orfQ, query_dna_start, query_dna_end = query_data orfS, sbjct_dna_start, sbjct_dna_end = sbjct_data (intron1_dObj, intron1_aObj, intron2_dObj, intron2_aObj) = splicesite_data #################################################### if verbose: print "tinyexon orf:", orfQ print "tinyexon orf:", intron1_aObj print "tinyexon orf:", intron2_dObj #################################################### #################################### # make tinyexon PacbPORF #################################### startQaa = orfQ.dnapos2aapos(query_dna_start) - 1 startSaa = orfS.dnapos2aapos(sbjct_dna_start) - 1 stopQaa = orfQ.dnapos2aapos(query_dna_end) + 1 stopSaa = orfS.dnapos2aapos(sbjct_dna_end) + 1 # check for directly leading stop codon on tinyexon while startQaa <= orfQ.protein_startPY: startQaa += 1 startSaa += 1 query_dna_start += 3 sbjct_dna_start += 3 while startSaa <= orfS.protein_startPY: startQaa += 1 startSaa += 1 query_dna_start += 3 sbjct_dna_start += 3 # check for directly tailing stop codon on tinyexon while stopQaa > orfQ.protein_endPY: stopQaa -= 1 stopSaa -= 1 query_dna_end -= 3 sbjct_dna_end -= 3 while stopSaa > orfS.protein_endPY: stopQaa -= 1 stopSaa -= 1 query_dna_end -= 3 sbjct_dna_end -= 3 # get sequences qAAseq = orfQ.getaas(abs_pos_start=startQaa, abs_pos_end=stopQaa) sAAseq = 
orfS.getaas(abs_pos_start=startSaa, abs_pos_end=stopSaa) #################################################### if verbose or len(qAAseq) != len(sAAseq): # if unequal lengths, error will be raised upon PacbP.__init__() print orfQ, qAAseq, startQaa, stopQaa, (stopQaa - startQaa), print(query_dna_start, query_dna_end) print orfS, sAAseq, startSaa, stopSaa, (stopSaa - startSaa), print(sbjct_dna_start, sbjct_dna_end) print orfQ.inputgenomicsequence[query_dna_start - 2:query_dna_end + 2] print orfS.inputgenomicsequence[sbjct_dna_start - 2:sbjct_dna_end + 2] #################################################### # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=(qAAseq, sAAseq, startQaa, startSaa)) pacbp.strip_unmatched_ends() pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS) pacbporf.extend_pacbporf_after_stops() pacbporf.source = 'ABGPprojectingTE' #################################### # make introns #################################### intron1 = IntronConnectingOrfs(intron1_dObj, intron1_aObj, None, donorOrf, pacbporf.orfQ) intron2 = IntronConnectingOrfs(intron2_dObj, intron2_aObj, None, pacbporf.orfQ, accepOrf) ################################################################ # set some meta-data properties to the intron objects ################################################################ # add distance score to intron intron1._distance = 0 intron2._distance = 0 # add Alignment Positional Periphery Score into objects if queryorsbjct == "query": succes = set_apps_intron_query(intron1, pacbporfD, pacbporf) succes = set_apps_intron_query(intron2, pacbporf, pacbporfA) else: succes = set_apps_intron_sbjct(intron1, pacbporfD, pacbporf) succes = set_apps_intron_sbjct(intron2, pacbporf, pacbporfA) # set GFF fsource attribute for recognition of intron sources intron1._gff['fsource'] = "ABGPprojectingTE" intron2._gff['fsource'] = "ABGPprojectingTE" # create _linked_to_xxx attributes intron1._linked_to_pacbporfs = [pacbporf] 
intron2._linked_to_pacbporfs = [pacbporf] intron1._linked_to_introns = [intron2] intron2._linked_to_introns = [intron1] #################################################### if verbose: print pacbporf pacbporf.print_protein_and_dna() print intron1 print intron2 if False: # printing data when this function needs to be debugged: print "" print intron1 print intron2 print "" print pacbporfD pacbporfD.print_protein_and_dna() print "" print pacbporf pacbporf.print_protein_and_dna() print "" print pacbporfA pacbporfA.print_protein_and_dna() import sys sys.exit() #################################################### # return introns and intermediate tinyexon PacbPORF return [(intron1, intron2, pacbporf)]
def clustalwinput2cbg(seqs,orfs,coords,nodes, matrix = None, minimal_overall_spanning_range_size = 3, verbose=False): """ @type seqs: dict @param seqs: dict with ORGANISM IDENTIFIER as keys, sequences as values @type orfs: dict @param orfs: dict with ORGANISM IDENTIFIER as keys, Orf objects as values @type coords: dict @param coords: dict with ORGANISM IDENTIFIER as keys, [ sta, end ] as values @type nodes: list @param nodes: list with nodes corresponding to the ORGANISM IDENTIFIER in the dictionaries @attention: coordinates in coords should correspond to the sequneces in seqs! """ # do clustalw and strip_alignment_for_exterior_gaps (algseqs,algm) = clustalw(seqs=seqs) #################################################################### if verbose: print seqs, "\n", algseqs, "\n", algm, "\n", coords #################################################################### _testalgseqs,_testalgm,_testcoords = strip_alignment_for_exterior_gaps( deepcopy(algseqs),deepcopy(algm),deepcopy(coords)) if not _testalgm: #################################################################### if verbose: print "NO ALGM\n", seqs, "\n", _testalgseqs, "\n", _testalgm #################################################################### # alignment completely vanished by `strip_alignment_for_exterior_gaps` return None # do required import here (prevent circular imports) from graphAbgp.graph_codingblock import CodingBlockGraph from graphAbgp.exceptions import NoOverallMinimalSpanningRange from pacb import conversion as pacbconversion if not matrix: raise "No ProteinSimilarityMatrix applied!" # translate the clustalw alignment into an artificial CBG newcbg = CodingBlockGraph() newcbg.add_nodes(nodes) pacbp_is_none = False for nodeA,nodeB in newcbg.pairwisecrosscombinations_node(): orgA = newcbg.organism_by_node(nodeA) orgB = newcbg.organism_by_node(nodeB) # create stripped alignments for this pair of sequences # do not forget to make deepcopies of the data structures! 
subcoords = { orgA: coords[orgA], orgB: coords[orgB] } subalgseqs = { orgA: algseqs[orgA], orgB: algseqs[orgB] } _algseqs,_algm,_coords = strip_alignment_for_exterior_gaps( deepcopy(subalgseqs),deepcopy(algm),deepcopy(subcoords) ) # recreate a pairwise ClustalW alignment string _algm = make_clustalw_alignment_match( _algseqs[orgA],_algseqs[orgB], matrix = matrix.matrix ) # _algseqs keys are organisms, not nodes! alignment = ( _algseqs[orgA], _algm, _algseqs[orgB] ) paircoords = ( _coords[orgA][0], _coords[orgA][1], _coords[orgB][0], _coords[orgB][1] ) pacbp = pacbconversion.pacbp_from_clustalw( alignment=alignment,coords=paircoords) if pacbp == None: # pacbp is not creatable -> break i.o.t. return None pacbp_is_none = True break pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB]) #################################################################### if verbose: print orgA, orgB, pacbporf for item in alignment: print item print paircoords #################################################################### wt = pacbporf.bitscore pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB) newcbg.add_edge(nodeA,nodeB,wt=wt) newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf # check if all pacbporfs are created succesfully if pacbp_is_none: return None # update edge weight by OMSR and return newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE =\ minimal_overall_spanning_range_size if newcbg.has_overall_minimal_spanning_range(): newcbg.update_edge_weights_by_minimal_spanning_range() try: newcbg.correct_pacbpgaps_nearby_omsr() return newcbg except NoOverallMinimalSpanningRange: return None else: return None
def WORKING_sprdif2clustalw2cbg(cbg,sprdif,SCAFFOLD_GAP_OMSR_OFFSET=0,verbose=False): """ """ # gather sequence concerning the scaffold gap of the mutual nodes seqs, orfs, coords = {}, {}, {} for node in sprdif.keys(): org = cbg.organism_by_node(node) sta = min( sprdif[node] ) - SCAFFOLD_GAP_OMSR_OFFSET end = max( sprdif[node] ) + SCAFFOLD_GAP_OMSR_OFFSET orf = cbg.get_orfs_of_graph(organism=org)[0] seq = orf.getaas(abs_pos_start=sta,abs_pos_end=end) seqs[org] = seq orfs[org] = orf coords[org] = [sta,end] # do clustalw and strip_alignment_for_exterior_gaps (_algseqs,_algm) = clustalw(seqs=seqs) #################################################################### if verbose: print seqs, "\n", _algseqs, "\n", _algm #################################################################### _algseqs,_algm,coords = strip_alignment_for_exterior_gaps(_algseqs,_algm,coords) if not _algm: #################################################################### if verbose: print "NO ALGM.??\n", seqs, "\n", _algseqs, "\n", _algm #################################################################### # alignment completely vanished by `strip_alignment_for_exterior_gaps` return None # do required import here (prevent circular imports) from graphAbgp.graph_codingblock import CodingBlockGraph from graphAbgp.exceptions import NoOverallMinimalSpanningRange from pacb import conversion as pacbconversion from lib_cexpander import cexpander_checkCBG4omsrbordergaps, ZeroUniformlyAlignedPositions # translate the clustalw alignment into an artificial CBG newcbg = CodingBlockGraph() newcbg.add_nodes(sprdif.keys()) pacbp_is_none = False for nodeA,nodeB in newcbg.pairwisecrosscombinations_node(): orgA = cbg.organism_by_node(nodeA) orgB = cbg.organism_by_node(nodeB) # _algseqs keys are organisms, not nodes! 
alignment = ( _algseqs[orgA], _algm, _algseqs[orgB] ) paircoords = ( coords[orgA][0], coords[org][1], coords[orgB][0], coords[orgB][1] ) pacbp = pacbconversion.pacbp_from_clustalw(alignment=alignment,coords=paircoords) if pacbp == None: # pacbp is not creatable -> break i.o.t. return None pacbp_is_none = True break pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB]) wt = pacbporf.bitscore pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB) newcbg.add_edge(nodeA,nodeB,wt=wt) newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf # check if all pacbporfs are created succesfully if pacbp_is_none: return None # update edge weight by OMSR and return newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE = 3 if newcbg.has_overall_minimal_spanning_range(): newcbg.update_edge_weights_by_minimal_spanning_range() try: newcbg.correct_pacbpgaps_nearby_omsr() return newcbg except NoOverallMinimalSpanningRange: return None #try: # status = cexpander_checkCBG4omsrbordergaps(newcbg) # return newcbg #except NoOverallMinimalSpanningRange: # return None #except ZeroUniformlyAlignedPositions: # return None #except: # return None else: return None
def WORKING_sprdif2clustalw2cbg(cbg,sprdif,SCAFFOLD_GAP_OMSR_OFFSET=1,verbose=False): """ """ # gather sequence concerning the scaffold gap of the mutual nodes seqs, orfs, coords = {}, {}, {} for node in sprdif.keys(): org = cbg.organism_by_node(node) sta = min( sprdif[node] ) - SCAFFOLD_GAP_OMSR_OFFSET end = max( sprdif[node] ) + SCAFFOLD_GAP_OMSR_OFFSET orf = cbg.get_orfs_of_graph(organism=org)[0] # correct a priori for out-of-range exceptions # due to SCAFFOLD_GAP_OMSR_OFFSET sta = max([ sta, orf.protein_startPY ]) end = min([ end, orf.protein_endPY ]) seq = orf.getaas(abs_pos_start=sta,abs_pos_end=end) seqs[org] = seq orfs[org] = orf coords[org] = [sta,end] # do clustalw and strip_alignment_for_exterior_gaps (algseqs,algm) = clustalw(seqs=seqs) #################################################################### if verbose: print seqs, "\n", algseqs, "\n", algm, "\n", coords #################################################################### _testalgseqs,_testalgm,_testcoords = strip_alignment_for_exterior_gaps( deepcopy(algseqs),deepcopy(algm),deepcopy(coords)) if not _testalgm: #################################################################### if verbose: print "NO ALGM\n", seqs, "\n", _testalgseqs, "\n", _testalgm #################################################################### # alignment completely vanished by `strip_alignment_for_exterior_gaps` return None # do required import here (prevent circular imports) from graphAbgp.graph_codingblock import CodingBlockGraph from graphAbgp.exceptions import NoOverallMinimalSpanningRange from pacb import conversion as pacbconversion from lib_cexpander import cexpander_checkCBG4omsrbordergaps, ZeroUniformlyAlignedPositions # translate the clustalw alignment into an artificial CBG newcbg = CodingBlockGraph() newcbg.add_nodes(sprdif.keys()) pacbp_is_none = False for nodeA,nodeB in newcbg.pairwisecrosscombinations_node(): orgA = cbg.organism_by_node(nodeA) orgB = cbg.organism_by_node(nodeB) # create stripped 
alignments for this pair of sequences # do not forget to make deepcopies of the data structures! subcoords = { orgA: coords[orgA], orgB: coords[orgB] } subalgseqs = { orgA: algseqs[orgA], orgB: algseqs[orgB] } _algseqs,_algm,_coords = strip_alignment_for_exterior_gaps( deepcopy(subalgseqs),deepcopy(algm),deepcopy(subcoords) ) # get a/the ProteinSimilarityMatrix from the original PacbP(ORF) # and then recreate a pairwise ClustalW alignment string protsimmtrx = cbg.get_pacbps_by_nodes(node1=nodeA,node2=nodeB)[0].MATRIX _algm = make_clustalw_alignment_match( _algseqs[orgA],_algseqs[orgB], matrix = protsimmtrx.matrix ) # _algseqs keys are organisms, not nodes! alignment = ( _algseqs[orgA], _algm, _algseqs[orgB] ) paircoords = ( _coords[orgA][0], _coords[orgA][1], _coords[orgB][0], _coords[orgB][1] ) pacbp = pacbconversion.pacbp_from_clustalw( alignment=alignment,coords=paircoords) if pacbp == None: # pacbp is not creatable -> break i.o.t. return None pacbp_is_none = True break pacbporf = pacbconversion.pacbp2pacbporf(pacbp,orfs[orgA],orfs[orgB]) #################################################################### if verbose: print orgA, orgB, pacbporf for item in alignment: print item print paircoords #################################################################### wt = pacbporf.bitscore pacbpkey = pacbporf.construct_unique_key(nodeA,nodeB) newcbg.add_edge(nodeA,nodeB,wt=wt) newcbg.pacbps[(pacbpkey,nodeA,nodeB)] = pacbporf # check if all pacbporfs are created succesfully if pacbp_is_none: return None # update edge weight by OMSR and return newcbg.MINIMAL_OVERAL_SPANNING_RANGE_SIZE = 3 if newcbg.has_overall_minimal_spanning_range(): newcbg.update_edge_weights_by_minimal_spanning_range() try: newcbg.correct_pacbpgaps_nearby_omsr() return newcbg except NoOverallMinimalSpanningRange: return None else: return None
def _find_qp_and_pq_tinyexons_as_pacbporfs(target,tinyexondata,PCG,min_discovery_count=2):
    """
    Find target tinyexons that match on informant Orfs (label "QP")

    Each tinyexon of the target, located on an Orf that is not yet a query
    Orf in the target-informant PacbPORFs, is matched on all informant
    Orfs. A match is stored only when it is not a sub- or superset of a
    known PacbPORF and when an intron towards a known PacbPORF on the same
    sbjct Orf can be made.

    @type  target: organism identifier
    @param target: key in tinyexondata of the target organism

    @type  tinyexondata: dict
    @param tinyexondata: organism identifiers as keys, tinyexons as values

    @type  PCG: graph object
    @param PCG: delivers PacbPORFs and Orfs of the organisms

    @type  min_discovery_count: integer
    @param min_discovery_count: passed as min_value_list_size to
           _remove_dict_elements_with_short_value_list

    @rtype:  dict
    @return: ( proteinsequence, start ) of tinyexons as keys; values as
             filled by _update_tinyexon_pacbporf_dict
    """
    collected = {}
    for informant in tinyexondata.keys():
        if informant == target:
            continue
        known_pacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target,informant))
        # Orf ids already in use as query in the known PacbPORFs
        known_query_orf_ids = [ pf.orfQ.id for pf in known_pacbporfs ]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in known_query_orf_ids:
                continue

            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                for (aaseq,aapos) in _find_qp_or_pq_match_on_orfobj(exonQ,orfObj):
                    # turn the match into an (extended) PacbPORF
                    pacbp_input = (
                            exonQ.proteinsequence(),
                            aaseq,
                            exonQ.orf.dnapos2aapos(exonQ.start),
                            aapos )
                    candidate = pacbp2pacbporf(
                            PacbP(input=pacbp_input),exonQ.orf,orfObj)
                    candidate.extend_pacbporf_after_stops()

                    # skip candidates included in a known PacbPORF
                    is_included = False
                    for known in known_pacbporfs:
                        if candidate.issubsetorsuperset(known):
                            is_included = True
                            break
                    if is_included:
                        continue

                    # a (perfect) intron must be projectable towards one
                    # of the known PacbPORFs on the same sbjct Orf
                    for known in known_pacbporfs:
                        if known.orfS.id != candidate.orfS.id:
                            continue
                        # candidate is the donor when it starts first
                        if min(known.alignment_dna_range_query()) >\
                        min(candidate.alignment_dna_range_query()):
                            donor, acceptor = candidate, known
                        else:
                            donor, acceptor = known, candidate
                        try:
                            introns = merge_pacbporfs_by_intron_in_query(
                                donor,acceptor,
                                max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event: TODO: solve in
                            # merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) >= 1:
                            break
                    else:
                        # not confirmed by any intron projection
                        continue

                    # check if placeable in PCG/pacbporflist
                    compatibility = [ pf.is_postioned_compatibly(candidate)
                            for pf in known_pacbporfs ]
                    rejected = compatibility.count(False) > 0

                    # label pacbporf as found by tinyexon QP and store it
                    candidate._tinyexon_label = "QP"
                    key = (exonQ.proteinsequence(),exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                            collected,
                            key,candidate,rejected,informant)

    # remove tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
            collected,
            min_value_list_size=min_discovery_count)

    return collected
def update_PCG_with_signalpexons(signalpexonseqs,PCG,OPTIONS, min_pacbporf_identityscore=0.20,verbose=True): """ """ if not signalpexonseqs.has_key(OPTIONS.target): return False is_any_pacbporf_added = False for targetSPexon in signalpexonseqs[OPTIONS.target]: target = OPTIONS.target for informant,infSPlist in signalpexonseqs.iteritems(): if informant == OPTIONS.target: continue # check if informant has been deleted in the meanwhile if informant not in PCG.organism_set(): continue # list to store signalp exons into signalpexon_pacbp_list = [] # get ordered pacbporfs fromt he PCG thepacbporfs = order_pacbporf_list(PCG.get_pacbps_by_organisms(OPTIONS.target,informant)) if not thepacbporfs: # no alignments present for this organism (can happen!) continue for informantSPexon in infSPlist: coords = [ targetSPexon.protein_start(), targetSPexon.protein_end(), informantSPexon.protein_start(), informantSPexon.protein_end(), ] # prior to making ClustalW-PacbP, check PacbPCOORD placeability # into the list of pacbporfs pacbpCoordsObj = PacbPCOORDS(input=( targetSPexon.proteinsequence(), informantSPexon.proteinsequence(), targetSPexon.protein_start(), informantSPexon.protein_start(), ) ) if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]: # *NOT* placable in current ordered list of PacbPORFS continue dist = pacbpCoordsObj.distance_towards(thepacbporfs[0]) if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH/3: # WAY TO FAR in front of current gene structure parts. # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS continue elif dist == 0: # NOT placeable in front of the rest of the PacbPORFS. 
continue else: pass # perform ClustalW alignment on the SP exons (alignedseqs,alignment) =\ clustalw( seqs= { OPTIONS.target: targetSPexon.proteinsequence(), informant: informantSPexon.proteinsequence() } ) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw( alignment=( alignedseqs[OPTIONS.target], alignment, alignedseqs[informant] ), coords=coords ) # is there any alignment constructed? if not pacbp: continue # ignore (very) poor identyscore alignments if pacbp.identityscore < min_pacbporf_identityscore: continue # if here make extended pacbpORF signalpexonPacbpORF = pacbp2pacbporf(pacbp, targetSPexon.orf,informantSPexon.orf) signalpexonPacbpORF.extend_pacbporf_after_stops() # and store in signalpexon_pacbp_list signalpexon_pacbp_list.append( signalpexonPacbpORF ) ################################################################ if verbose: print alignedseqs[OPTIONS.target], OPTIONS.target print alignment print alignedseqs[informant], informant if pacbp: print pacbp, (OPTIONS.target, targetSPexon.orf.id), print (informant, informantSPexon.orf.id), print "DISTANCE::", dist pacbp.print_protein() print "" ################################################################ # If there are signalpexon-guided pacbporfs found, store the one # with the highest bitscore if signalpexon_pacbp_list: signalpexon_pacbp_list = order_list_by_attribute( signalpexon_pacbp_list,order_by='bits',reversed=True) # store best bitscoring pacbporf to PCG signalp_pacbporf = signalpexon_pacbp_list[0] pacbporf2PCG(signalp_pacbporf,OPTIONS.target,informant,PCG,source='SignalP-ClustalW') is_any_pacbporf_added = True #################################################################### if verbose: print "SignalP Exon added to PCG:", signalp_pacbporf, informant #################################################################### else: pass # return pointer is_any_pacbporf_added return is_any_pacbporf_added
def _find_qq_tinyexons_as_pacbporfs(target,tinyexondata,PCG,min_discovery_count=2): """ """ target_tinyexon_pacbporf_data = {} for informant in tinyexondata.keys(): if informant == target: continue thepacbporfs = order_pacbporf_list( PCG.get_pacbps_by_organisms(target,informant)) for exonQ in tinyexondata[target]: if exonQ.orf.id in [ pf.orfQ.id for pf in thepacbporfs ]: continue for (prevpos,nextpos) in [ (pos-1,pos) for pos in range(1,len(thepacbporfs)) ]: prevPF = thepacbporfs[prevpos] nextPF = thepacbporfs[nextpos] if prevPF.orfS.id == nextPF.orfS.id: # check if PacbPORFs are positioned more or less okay if prevPF.distance_towards(nextPF) > 20: continue # check if exonQ is positioned ~between these PacbPORFs if exonQ.orf.dnapos2aapos(exonQ.end) < max(prevPF.alignment_protein_range_query())-12: continue if exonQ.orf.dnapos2aapos(exonQ.start) > min(nextPF.alignment_protein_range_query())+12: continue # check if gap can be projected already by a perfect intron introns = merge_pacbporfs_by_intron_in_query( prevPF,nextPF,max_aa_offset=1) # if introns found => continue if introns: continue # orfObj is the orfS of prevPF or nextPF (just take any) orfObj = prevPF.orfS # assign elegiable range of tinyexon match on SBJCT aapos_sbjct_range = range( max(prevPF.alignment_protein_range_sbjct())-12, min(nextPF.alignment_protein_range_sbjct())+12 ) tinyexonmatches = _find_match_on_orfobj(exonQ,orfObj) for (aaseq,aapos) in tinyexonmatches: # check if the match is obtained in the expected # sbjct AA range; if not, ignore the match if aapos not in aapos_sbjct_range: continue # make pacbporf object pacbpobj = PacbP(input=( exonQ.proteinsequence(), aaseq, exonQ.orf.dnapos2aapos(exonQ.start), aapos ) ) pacbporfobj = pacbp2pacbporf(pacbpobj,exonQ.orf,orfObj) pacbporfobj.extend_pacbporf_after_stops() # remove included pacbporfs is_suborsuperset = False for accepted_pacbporf in thepacbporfs: if pacbporfobj.issubsetorsuperset(accepted_pacbporf): is_suborsuperset = True break if 
is_suborsuperset: continue # check if 2 (perfect) introns can be projected introns5p = merge_pacbporfs_by_intron_in_query( prevPF,pacbporfobj, max_aa_offset=1, max_intron_nt_length=None) #max_intron_nt_length=140) introns3p = merge_pacbporfs_by_intron_in_query( pacbporfobj,nextPF, max_aa_offset=1, max_intron_nt_length=None) #max_intron_nt_length=140) # continue if not is_confirmed_by_intron_projection if not introns5p or not introns3p: continue # check if placeable in PCG/pacbporflist distPrev = prevPF.distance_towards(pacbporfobj) distNext = pacbporfobj.distance_towards(nextPF) ovrlPrev = pacbporfobj.overlap(prevPF) ovrlNext = pacbporfobj.overlap(nextPF) if distPrev and distNext: rejected = False elif not distPrev and ovrlPrev: rejected = False elif not distNext and ovrlNext: rejected = False elif ovrlPrev and ovrlNext: rejected = False else: rejected = True print "OKAY", exonQ.proteinsequence(), aaseq, rejected, informant, (distPrev,distNext,ovrlPrev,ovrlNext) # label pacbporf as found by tinyexon QQ pacbporfobj._tinyexon_label = "QQ" # store to target_tinyexon_pacbporf_data key = (exonQ.proteinsequence(),exonQ.start) _update_tinyexon_pacbporf_dict( target_tinyexon_pacbporf_data, key,pacbporfobj,rejected,informant) # cleanup tinyexon protein matches that have been observed to litte _remove_dict_elements_with_short_value_list( target_tinyexon_pacbporf_data, min_value_list_size=min_discovery_count) # return target_tinyexon_pacbporf_data return target_tinyexon_pacbporf_data
def merge_pacbporfs_by_tinyexons(pacbporfD,pacbporfA, orfSetObjQ,orfSetObjS,verbose=False,**kwargs): """ """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs,KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 resultlistQ = merge_orfs_with_tinyexon( pacbporfD.orfQ,pacbporfA.orfQ, preceding_donor_sites=pacbporfD.orfQ._donor_sites, subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites, orflist=orfSetObjQ.orfs,**kwargs) resultlistS = merge_orfs_with_tinyexon( pacbporfD.orfS,pacbporfA.orfS, preceding_donor_sites=pacbporfD.orfS._donor_sites, subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites, orflist=orfSetObjS.orfs,**kwargs) # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ] resultdictQ,key2exonQ = _tinyexon_list_2_dict(resultlistQ) resultdictS,key2exonS = _tinyexon_list_2_dict(resultlistS) # get unique list of donors & acceptors donorQ = olba( list(Set([inD.donor for inD,te,inA in resultlistQ ])), order_by='pos') donorS = olba( list(Set([inD.donor for inD,te,inA in resultlistS ])), order_by='pos') accepQ = olba( list(Set([inA.acceptor for inD,te,inA in resultlistQ ])), order_by='pos') accepS = olba( list(Set([inA.acceptor for inD,te,inA in resultlistS ])), order_by='pos') ## filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = True # True kwargs['aligned_site_max_triplet_distance'] = 0 # 2 algdonors = _filter_for_alignable_splice_sites(donorQ,donorS,pacbporfD,**kwargs) algacceps = _filter_for_alignable_splice_sites(accepQ,accepS,pacbporfA,**kwargs) # settings for minimal alignment entropy score # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!! 
min_donor_site_alignment_entropy = 0.1 min_acceptor_site_alignment_entropy = 0.1 # remove sites with to low alignment entropy algdonors = _filter_for_entropy(algdonors,pacbporfD,'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy(algacceps,pacbporfA,'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS return_list = [] ############################################################################ if verbose: print "bridges constructed: ORFS:", print (pacbporfD.orfQ.id,pacbporfA.orfQ.id), print (pacbporfD.orfS.id,pacbporfA.orfS.id), print len(resultdictQ), len(resultdictS), print ( len(resultlistQ), len(donorQ), len(accepQ) ), print ( len(resultlistS), len(donorS), len(accepS) ), print ( len(algdonors), len(algacceps) ) ############################################################################ for keyQ,tinyexonQ in key2exonQ.iteritems(): for keyS,tinyexonS in key2exonS.iteritems(): if tinyexonQ.donor.phase != tinyexonS.donor.phase: continue if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase: continue if tinyexonQ.length != tinyexonS.length: continue # if here, then tinyexons of identical structure #################################################################### if verbose: print tinyexonQ.length, tinyexonQ.donor.phase, print ( len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1]) ), print ( len(resultdictS[keyS][0]), len(resultdictS[keyS][1]) ), print tinyexonQ, print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score #################################################################### donor_introns = [] acceptor_introns = [] for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ,dS 
in algdonors ]: continue # check if they exists as aligned sites alignedkey = ( intronDQ.donor.pos, intronDS.donor.pos ) if alignedkey not in [ (dQ.pos, dS.pos) for dQ,dS in algdonors ]: continue # if here, we have a set of introns 5' of the tinyexon # which are perfectly alignable! donor_introns.append((intronDQ,intronDS)) for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]: continue # check if they exists as aligned sites alignedkey = ( intronAQ.acceptor.pos, intronAS.acceptor.pos ) if alignedkey not in [ (aQ.pos, aS.pos) for aQ,aS in algacceps ]: continue # if here, we have a set of introns 3' of the tinyexon # which are perfectly alignable! acceptor_introns.append((intronAQ,intronAS)) if not len(donor_introns) or not len(acceptor_introns): # no aligned 5' && aligned 3' introns continue # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=( tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), tinyexonQ.protein_start(), tinyexonS.protein_start(), ) ) pacbp.strip_unmatched_ends() # continue if no fraction could be aligned if len(pacbp) == 0: continue tinypacbporf = pacbp2pacbporf(pacbp,tinyexonQ.orf,tinyexonS.orf) tinypacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print tinypacbporf tinypacbporf.print_protein_and_dna() print len(donor_introns), len(acceptor_introns), print max([ dQ.donor.pssm_score+dS.donor.pssm_score for dQ,dS in donor_introns]), print max([ aQ.acceptor.pssm_score+aS.acceptor.pssm_score for aQ,aS in acceptor_introns]) #################################################################### # if here, we have accepted tinyexon bridges! 
# gather them and store to return_list for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [ dQ.pos for dQ,dS in algdonors ]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ,dS in algdonors ]: continue for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ,aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ,aS in algacceps ]: continue #################################################### # set some meta-data properties to the intron objects #################################################### _score_introns_obtained_by_mapping( intronDQ,intronDS,pacbporfD, tinypacbporf,source='ABGPmappingTE') _score_introns_obtained_by_mapping( intronAQ,intronAS,tinypacbporf, pacbporfA,source='ABGPmappingTE') # create _linked_to_xxx attributes intronDQ._linked_to_pacbporfs = [ tinypacbporf ] intronAQ._linked_to_pacbporfs = [ tinypacbporf ] intronDS._linked_to_pacbporfs = [ tinypacbporf ] intronAS._linked_to_pacbporfs = [ tinypacbporf ] intronDQ._linked_to_introns = [ intronAQ ] intronAQ._linked_to_introns = [ intronDQ ] intronDS._linked_to_introns = [ intronAS ] intronAS._linked_to_introns = [ intronDS ] # append to tmp result list return_list.append( (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) ) # check if there are >1 candidate tiny exons # currently, we choose only to return the **best** mapped tinyexon if len(return_list) == 0: pass elif len(return_list) == 1: pass else: # only take the highest scoring candidate here min_distance = min([ (a._distance+d._distance) for a,b,c,d,e in return_list ]) pos2score = [] for (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) in return_list: if (intronDQ._distance + intronAQ._distance) > min_distance: pos2score.append( 0.0 ) else: # calculate overall pssm score total_pssm = 0.0 
total_pssm += intronDQ.donor.pssm_score total_pssm += intronDQ.acceptor.pssm_score total_pssm += intronDS.donor.pssm_score total_pssm += intronDS.acceptor.pssm_score total_pssm += intronAQ.donor.pssm_score total_pssm += intronAQ.acceptor.pssm_score total_pssm += intronAS.donor.pssm_score total_pssm += intronAS.acceptor.pssm_score pos2score.append( total_pssm ) # get highest score and linked tinyexon max_score = max(pos2score) return_list = [ return_list[pos2score.index(max_score)] ] ############################################################################ # some printing in verbose mode if verbose and return_list: (intronDQ,intronDS,tinypacbporf,intronAQ,intronAS) = return_list[0] print "BEST MAPPED TINYEXON:" print tinypacbporf print tinypacbporf.query, intronDQ._distance, intronAQ._distance, print ( intronDQ.donor.pos, intronDQ.acceptor.pos ), print ( intronDS.donor.pos, intronDS.acceptor.pos ), print ( intronAQ.donor.pos, intronAQ.acceptor.pos ), print ( intronAS.donor.pos, intronAS.acceptor.pos ) ############################################################################ # return the result list return return_list
def update_PCG_with_signalpexons(signalpexonseqs, PCG, OPTIONS, min_pacbporf_identityscore=0.20, verbose=True): """ """ if not signalpexonseqs.has_key(OPTIONS.target): return False is_any_pacbporf_added = False for targetSPexon in signalpexonseqs[OPTIONS.target]: target = OPTIONS.target for informant, infSPlist in signalpexonseqs.iteritems(): if informant == OPTIONS.target: continue # check if informant has been deleted in the meanwhile if informant not in PCG.organism_set(): continue # list to store signalp exons into signalpexon_pacbp_list = [] # get ordered pacbporfs fromt he PCG thepacbporfs = order_pacbporf_list( PCG.get_pacbps_by_organisms(OPTIONS.target, informant)) if not thepacbporfs: # no alignments present for this organism (can happen!) continue for informantSPexon in infSPlist: coords = [ targetSPexon.protein_start(), targetSPexon.protein_end(), informantSPexon.protein_start(), informantSPexon.protein_end(), ] # prior to making ClustalW-PacbP, check PacbPCOORD placeability # into the list of pacbporfs pacbpCoordsObj = PacbPCOORDS(input=( targetSPexon.proteinsequence(), informantSPexon.proteinsequence(), targetSPexon.protein_start(), informantSPexon.protein_start(), )) if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]: # *NOT* placable in current ordered list of PacbPORFS continue dist = pacbpCoordsObj.distance_towards(thepacbporfs[0]) if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3: # WAY TO FAR in front of current gene structure parts. # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS continue elif dist == 0: # NOT placeable in front of the rest of the PacbPORFS. 
continue else: pass # perform ClustalW alignment on the SP exons (alignedseqs,alignment) =\ clustalw( seqs= { OPTIONS.target: targetSPexon.proteinsequence(), informant: informantSPexon.proteinsequence() } ) # make pacbp from clustalw alignment pacbp = pacbp_from_clustalw( alignment=(alignedseqs[OPTIONS.target], alignment, alignedseqs[informant]), coords=coords) # is there any alignment constructed? if not pacbp: continue # ignore (very) poor identyscore alignments if pacbp.identityscore < min_pacbporf_identityscore: continue # if here make extended pacbpORF signalpexonPacbpORF = pacbp2pacbporf(pacbp, targetSPexon.orf, informantSPexon.orf) signalpexonPacbpORF.extend_pacbporf_after_stops() # and store in signalpexon_pacbp_list signalpexon_pacbp_list.append(signalpexonPacbpORF) ################################################################ if verbose: print alignedseqs[OPTIONS.target], OPTIONS.target print alignment print alignedseqs[informant], informant if pacbp: print pacbp, (OPTIONS.target, targetSPexon.orf.id), print(informant, informantSPexon.orf.id), print "DISTANCE::", dist pacbp.print_protein() print "" ################################################################ # If there are signalpexon-guided pacbporfs found, store the one # with the highest bitscore if signalpexon_pacbp_list: signalpexon_pacbp_list = order_list_by_attribute( signalpexon_pacbp_list, order_by='bits', reversed=True) # store best bitscoring pacbporf to PCG signalp_pacbporf = signalpexon_pacbp_list[0] pacbporf2PCG(signalp_pacbporf, OPTIONS.target, informant, PCG, source='SignalP-ClustalW') is_any_pacbporf_added = True #################################################################### if verbose: print "SignalP Exon added to PCG:", signalp_pacbporf, informant #################################################################### else: pass # return pointer is_any_pacbporf_added return is_any_pacbporf_added
def merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA, orfSetObjQ, orfSetObjS, verbose=False, **kwargs): """ """ # input validation IsPacbPORF(pacbporfD) IsPacbPORF(pacbporfA) # edit **kwargs dictionary for some forced attributes _update_kwargs(kwargs, KWARGS_MAPPED_INTRON) if not kwargs.has_key('aligned_site_max_triplet_distance'): kwargs['aligned_site_max_triplet_distance'] = kwargs['max_aa_offset'] # settings for minimal alignment entropy score min_donor_site_alignment_entropy = 0.0 min_acceptor_site_alignment_entropy = 0.0 resultlistQ = merge_orfs_with_tinyexon( pacbporfD.orfQ, pacbporfA.orfQ, preceding_donor_sites=pacbporfD.orfQ._donor_sites, subsequent_acceptor_sites=pacbporfA.orfQ._acceptor_sites, orflist=orfSetObjQ.orfs, **kwargs) resultlistS = merge_orfs_with_tinyexon( pacbporfD.orfS, pacbporfA.orfS, preceding_donor_sites=pacbporfD.orfS._donor_sites, subsequent_acceptor_sites=pacbporfA.orfS._acceptor_sites, orflist=orfSetObjS.orfs, **kwargs) # translate resultlists to dict: key == exon, value = [ {intronsD},{intronsS} ] resultdictQ, key2exonQ = _tinyexon_list_2_dict(resultlistQ) resultdictS, key2exonS = _tinyexon_list_2_dict(resultlistS) # get unique list of donors & acceptors donorQ = olba(list(Set([inD.donor for inD, te, inA in resultlistQ])), order_by='pos') donorS = olba(list(Set([inD.donor for inD, te, inA in resultlistS])), order_by='pos') accepQ = olba(list(Set([inA.acceptor for inD, te, inA in resultlistQ])), order_by='pos') accepS = olba(list(Set([inA.acceptor for inD, te, inA in resultlistS])), order_by='pos') ## filter for alignable donor & acceptor sites kwargs['allow_non_canonical'] = True # True kwargs['aligned_site_max_triplet_distance'] = 0 # 2 algdonors = _filter_for_alignable_splice_sites(donorQ, donorS, pacbporfD, **kwargs) algacceps = _filter_for_alignable_splice_sites(accepQ, accepS, pacbporfA, **kwargs) # settings for minimal alignment entropy score # TODO TODO -> THIS MUST BE FIXED TO A NICE THRESHOLD VALUE!!! 
min_donor_site_alignment_entropy = 0.1 min_acceptor_site_alignment_entropy = 0.1 # remove sites with to low alignment entropy algdonors = _filter_for_entropy( algdonors, pacbporfD, 'donor', min_alignment_entropy=min_donor_site_alignment_entropy) algacceps = _filter_for_entropy( algacceps, pacbporfA, 'acceptor', min_alignment_entropy=min_acceptor_site_alignment_entropy) # return list: intronQD,intronSD,tinyexon,intronAQ,intronAS return_list = [] ############################################################################ if verbose: print "bridges constructed: ORFS:", print(pacbporfD.orfQ.id, pacbporfA.orfQ.id), print(pacbporfD.orfS.id, pacbporfA.orfS.id), print len(resultdictQ), len(resultdictS), print(len(resultlistQ), len(donorQ), len(accepQ)), print(len(resultlistS), len(donorS), len(accepS)), print(len(algdonors), len(algacceps)) ############################################################################ for keyQ, tinyexonQ in key2exonQ.iteritems(): for keyS, tinyexonS in key2exonS.iteritems(): if tinyexonQ.donor.phase != tinyexonS.donor.phase: continue if tinyexonQ.acceptor.phase != tinyexonS.acceptor.phase: continue if tinyexonQ.length != tinyexonS.length: continue # if here, then tinyexons of identical structure #################################################################### if verbose: print tinyexonQ.length, tinyexonQ.donor.phase, print(len(resultdictQ[keyQ][0]), len(resultdictQ[keyQ][1])), print(len(resultdictS[keyS][0]), len(resultdictS[keyS][1])), print tinyexonQ, print tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), print tinyexonS.acceptor.pssm_score + tinyexonS.donor.pssm_score #################################################################### donor_introns = [] acceptor_introns = [] for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ, dS in 
algdonors ]: continue # check if they exists as aligned sites alignedkey = (intronDQ.donor.pos, intronDS.donor.pos) if alignedkey not in [(dQ.pos, dS.pos) for dQ, dS in algdonors]: continue # if here, we have a set of introns 5' of the tinyexon # which are perfectly alignable! donor_introns.append((intronDQ, intronDS)) for intronAQkey, intronAQ in resultdictQ[keyQ][1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ, aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ, aS in algacceps ]: continue # check if they exists as aligned sites alignedkey = (intronAQ.acceptor.pos, intronAS.acceptor.pos) if alignedkey not in [(aQ.pos, aS.pos) for aQ, aS in algacceps]: continue # if here, we have a set of introns 3' of the tinyexon # which are perfectly alignable! acceptor_introns.append((intronAQ, intronAS)) if not len(donor_introns) or not len(acceptor_introns): # no aligned 5' && aligned 3' introns continue # initialize extended tinyexon PacbPORF from pacb import PacbP pacbp = PacbP(input=( tinyexonQ.proteinsequence(), tinyexonS.proteinsequence(), tinyexonQ.protein_start(), tinyexonS.protein_start(), )) pacbp.strip_unmatched_ends() # continue if no fraction could be aligned if len(pacbp) == 0: continue tinypacbporf = pacbp2pacbporf(pacbp, tinyexonQ.orf, tinyexonS.orf) tinypacbporf.extend_pacbporf_after_stops() #################################################################### if verbose: print tinypacbporf tinypacbporf.print_protein_and_dna() print len(donor_introns), len(acceptor_introns), print max([ dQ.donor.pssm_score + dS.donor.pssm_score for dQ, dS in donor_introns ]), print max([ aQ.acceptor.pssm_score + aS.acceptor.pssm_score for aQ, aS in acceptor_introns ]) #################################################################### # if here, we have accepted tinyexon bridges! 
# gather them and store to return_list for intronDQkey, intronDQ in resultdictQ[keyQ][0].iteritems(): if intronDQ.donor.pos not in [dQ.pos for dQ, dS in algdonors]: continue for intronDSkey, intronDS in resultdictS[keyS][0].iteritems(): if intronDS.donor.pos not in [ dS.pos for dQ, dS in algdonors ]: continue for intronAQkey, intronAQ in resultdictQ[keyQ][ 1].iteritems(): if intronAQ.acceptor.pos not in [ aQ.pos for aQ, aS in algacceps ]: continue for intronASkey, intronAS in resultdictS[keyS][ 1].iteritems(): if intronAS.acceptor.pos not in [ aS.pos for aQ, aS in algacceps ]: continue #################################################### # set some meta-data properties to the intron objects #################################################### _score_introns_obtained_by_mapping( intronDQ, intronDS, pacbporfD, tinypacbporf, source='ABGPmappingTE') _score_introns_obtained_by_mapping( intronAQ, intronAS, tinypacbporf, pacbporfA, source='ABGPmappingTE') # create _linked_to_xxx attributes intronDQ._linked_to_pacbporfs = [tinypacbporf] intronAQ._linked_to_pacbporfs = [tinypacbporf] intronDS._linked_to_pacbporfs = [tinypacbporf] intronAS._linked_to_pacbporfs = [tinypacbporf] intronDQ._linked_to_introns = [intronAQ] intronAQ._linked_to_introns = [intronDQ] intronDS._linked_to_introns = [intronAS] intronAS._linked_to_introns = [intronDS] # append to tmp result list return_list.append( (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS)) # check if there are >1 candidate tiny exons # currently, we choose only to return the **best** mapped tinyexon if len(return_list) == 0: pass elif len(return_list) == 1: pass else: # only take the highest scoring candidate here min_distance = min([(a._distance + d._distance) for a, b, c, d, e in return_list]) pos2score = [] for (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) in return_list: if (intronDQ._distance + intronAQ._distance) > min_distance: pos2score.append(0.0) else: # calculate overall pssm score total_pssm = 0.0 
total_pssm += intronDQ.donor.pssm_score total_pssm += intronDQ.acceptor.pssm_score total_pssm += intronDS.donor.pssm_score total_pssm += intronDS.acceptor.pssm_score total_pssm += intronAQ.donor.pssm_score total_pssm += intronAQ.acceptor.pssm_score total_pssm += intronAS.donor.pssm_score total_pssm += intronAS.acceptor.pssm_score pos2score.append(total_pssm) # get highest score and linked tinyexon max_score = max(pos2score) return_list = [return_list[pos2score.index(max_score)]] ############################################################################ # some printing in verbose mode if verbose and return_list: (intronDQ, intronDS, tinypacbporf, intronAQ, intronAS) = return_list[0] print "BEST MAPPED TINYEXON:" print tinypacbporf print tinypacbporf.query, intronDQ._distance, intronAQ._distance, print(intronDQ.donor.pos, intronDQ.acceptor.pos), print(intronDS.donor.pos, intronDS.acceptor.pos), print(intronAQ.donor.pos, intronAQ.acceptor.pos), print(intronAS.donor.pos, intronAS.acceptor.pos) ############################################################################ # return the result list return return_list